import os
import json
import csv
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# --- Configuration ---
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")

# --- Crawler Configuration ---
START_URL = "https://www.shl.com/solutions/products/product-catalog/"
BASE_CATALOG_URL_PREFIX = "https://www.shl.com/solutions/products/product-catalog/"
# Restrict the crawl to the catalog root, its pagination pages, and product detail-view pages
ALLOWED_PATTERNS = [
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"$"), # Exact base URL
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"\?start=\d+"), # Pagination URLs
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"view/"), # Detail View URLs
]
MAX_PAGES_TO_CRAWL = 200 # Safety limit (adjust as needed)
RAW_DATA_FILENAME = "shl_raw_scraped_data_specific.jsonl" # New file for specific crawl

# --- Processing & Output Configuration ---
PROCESSED_JSON_FILENAME = "shl_processed_analysis_specific.json" # New file
PROCESSED_CSV_FILENAME = "shl_processed_analysis_specific.csv" # New file
POLITE_DELAY_SECONDS = 1
REQUEST_TIMEOUT = 30
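# Rough per-page cap on characters sent to the LLM (keeps each request well within the model's context window)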
MAX_LLM_CONTENT_CHARS = 15000
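# Desktop-browser User-Agent; some servers reject the default requests UA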
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# --- LLM Setup ---
# Initialize the chat model, analysis prompt, and output parser
llm = ChatGoogleGenerativeAI(model="gemma-3-27b-it", google_api_key=google_api_key)
prompt_template = ChatPromptTemplate.from_template(
    """
    Based *only* on the following text content scraped from an SHL web page, please provide:
    1. A concise summary of the page's main topic (2-4 sentences).
    2. If it describes an assessment or product, list its key features, benefits, or what it measures (up to 5 bullet points). Otherwise, state "Not applicable".

    Do not add any information not present in the text. If the text is insufficient or irrelevant, state that.

    Scraped Content:
    {context}

    Analysis:
    """
)
output_parser = StrOutputParser()
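# RunnablePassthrough feeds the raw scraped text into the prompt's {context} slot;
# the model's reply is then parsed down to a plain string.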
llm_chain = {"context": RunnablePassthrough()} | prompt_template | llm | output_parser

# --- Helper Functions ---

def get_soup(url):
    """Fetches URL content and returns a BeautifulSoup object or None on error."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        response.raise_for_status()
        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            print(f"  Skipping URL {url}: Non-HTML content type ({content_type})")
            return None, None
        html_content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup, html_content  # return the same decoded HTML that was parsed
    except requests.exceptions.Timeout:
        print(f"  Timeout error fetching {url}")
        return None, None
    except requests.exceptions.RequestException as e:
        print(f"  Request error fetching {url}: {e}")
        return None, None
    except Exception as e:
        print(f"  Error processing {url}: {e}")
        return None, None

def is_allowed_shl_url(url):
    """Checks if the URL matches one of the defined allowed patterns."""
    # Simple check first
    if not url or not url.startswith(BASE_CATALOG_URL_PREFIX):
        return False
    # Check against regex patterns
    for pattern in ALLOWED_PATTERNS:
        if pattern.match(url):
            return True
    # print(f"    Debug: URL rejected by patterns: {url}") # Optional debug
    return False

def extract_text_from_html(html_content):
    """Extracts and cleans text from raw HTML string."""
    if not html_content: return None
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form']): # strip non-content elements
            element.decompose()
        body = soup.body
        text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    except Exception as e:
        print(f"  Error extracting text from HTML snippet: {e}")
        return None

# --- Phase 1: Crawl & Scrape Raw HTML (Specific URLs) ---

def crawl_and_scrape_raw_specific(start_url, max_pages, output_filename):
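    """Breadth-first crawl restricted to ALLOWED_PATTERNS, appending each page's
    raw HTML to a JSON Lines file ({"url": ..., "raw_html": ...} per line)."""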
    print(f"--- Starting Phase 1: Crawling Specific SHL URLs (Max: {max_pages} pages) ---")
    queue = deque([start_url])
    visited_urls = {start_url}
    pages_scraped = 0

    with open(output_filename, 'w', encoding='utf-8'):
        pass  # truncate any output left over from a previous run
    print(f"Cleared/Prepared raw data file: {output_filename}")

    while queue and pages_scraped < max_pages:
        current_url = queue.popleft()
        print(f"\nProcessing ({pages_scraped + 1}/{max_pages}): {current_url}")

        # Check BEFORE fetching if URL is allowed (it should be if it came from queue, but good practice)
        if not is_allowed_shl_url(current_url):
            print(f"  Skipping non-allowed URL from queue (should not happen): {current_url}")
            continue

        soup, raw_html = get_soup(current_url)

        if raw_html:
            try:
                with open(output_filename, 'a', encoding='utf-8') as f_out:
                    json.dump({"url": current_url, "raw_html": raw_html}, f_out)
                    f_out.write('\n')
                pages_scraped += 1
                print(f"  Successfully scraped and saved raw HTML ({len(raw_html)} bytes).")
            except Exception as e:
                print(f"  Error saving raw data for {current_url}: {e}")

        if soup:
            links_found_on_page = 0
            new_links_added = 0
            for link in soup.find_all('a', href=True):
                href = link['href']
                absolute_url = urljoin(current_url, href)
                parsed_url = urlparse(absolute_url)
                normalized_url = parsed_url._replace(fragment="").geturl() # Remove fragment
                links_found_on_page += 1

                # *** Crucial Change: Check against specific allowed patterns ***
                if is_allowed_shl_url(normalized_url) and normalized_url not in visited_urls:
                    visited_urls.add(normalized_url)
                    queue.append(normalized_url)
                    new_links_added += 1
            print(f"  Inspected {links_found_on_page} links, added {new_links_added} new valid URLs to queue.")

        print(f"  Politely waiting for {POLITE_DELAY_SECONDS} second(s)...")
        time.sleep(POLITE_DELAY_SECONDS)

    print(f"\n--- Phase 1 Complete: Scraped {pages_scraped} pages matching allowed patterns. ---")
    print(f"Raw data saved incrementally to {output_filename}")


# --- Phase 2: Process Raw Data & AI Analysis (No Changes Needed) ---

def process_raw_data_with_ai(raw_data_input_filename):
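    """Reads the Phase 1 JSON Lines file, extracts visible text from each page's
    raw HTML, and runs the LLM analysis chain over the (possibly truncated) text."""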
    print(f"\n--- Starting Phase 2: Processing Raw Data from {raw_data_input_filename} & AI Analysis ---")
    processed_results = []
    processed_count = 0
    error_count = 0

    try:
        with open(raw_data_input_filename, 'r', encoding='utf-8') as f_in:
            for line_num, line in enumerate(f_in, 1):
                processed_count += 1
                url = f"Unknown (Line {line_num})" # Default if parsing fails
                try:
                    data = json.loads(line)
                    url = data.get("url", f"Unknown (Line {line_num})")
                    raw_html = data.get("raw_html")
                    print(f"\nProcessing item {processed_count}: {url}")

                    if not url or not raw_html:
                        print("  Skipping: Missing URL or raw HTML in record.")
                        error_count += 1
                        processed_results.append({
                            "url": url, "extracted_text": None, "ai_analysis": None,
                            "processing_status": "Error: Invalid Raw Data Record" })
                        continue

                    print("  Extracting text from raw HTML...")
                    extracted_text = extract_text_from_html(raw_html)
                    ai_analysis = None
                    if not extracted_text:
                        print("  Failed to extract text.")
                        status = "Error: Text Extraction Failed"
                        error_count += 1  # count extraction failures alongside other errors
                    else:
                        print(f"  Extracted ~{len(extracted_text)} characters. Sending to AI...")
                        try:
                            truncated_text = extracted_text
                            if len(extracted_text) > MAX_LLM_CONTENT_CHARS:
                                truncated_text = extracted_text[:MAX_LLM_CONTENT_CHARS] + "... (truncated)"
                                print(f"    Text truncated to {MAX_LLM_CONTENT_CHARS} chars for LLM.")

                            ai_analysis = llm_chain.invoke(truncated_text)
                            print("  AI analysis received.")
                            status = "Success: Analyzed"
                            print(f"  Politely waiting for {POLITE_DELAY_SECONDS} second(s)...")
                            time.sleep(POLITE_DELAY_SECONDS)

                        except Exception as e:
                            print(f"  Error during AI analysis: {e}")
                            ai_analysis = f"Error during AI analysis: {e}"
                            status = f"Error: AI Failed ({type(e).__name__})"
                            error_count += 1

                    processed_results.append({
                        "url": url, "extracted_text": extracted_text, "ai_analysis": ai_analysis,
                        "processing_status": status })

                except json.JSONDecodeError as e:
                    print(f"  Skipping invalid JSON line {line_num}: {e}")
                    error_count += 1
                    processed_results.append({ # Add error record
                         "url": f"Unknown (Line {line_num})", "extracted_text": None, "ai_analysis": None,
                         "processing_status": "Error: Invalid JSON in Raw Data" })
                    continue
                except Exception as e:
                    print(f"  Unexpected error processing line {line_num} ({url}): {e}")
                    error_count += 1
                    processed_results.append({
                        "url": url, "extracted_text": None, "ai_analysis": None,
                        "processing_status": f"Error: Unexpected Processing Failure ({type(e).__name__})" })
                    continue

    except FileNotFoundError:
        print(f"Error: Raw data file '{raw_data_input_filename}' not found. Cannot proceed.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred while reading raw data file: {e}")
        return processed_results

    print(f"\n--- Phase 2 Complete: Processed {processed_count} records with {error_count} errors. ---")
    return processed_results

# --- Phase 3: Save Processed Results (No Changes Needed) ---

def save_processed_results(final_data, json_filename, csv_filename):
    """Saves the final processed data to JSON and CSV files."""
    print("\n--- Starting Phase 3: Saving Processed Results ---")
    if not final_data:
        print("No processed data to save.")
        return

    # Union of keys across all records so the CSV header covers every field
    all_keys = set()
    for item in final_data:
        all_keys.update(item.keys())
    fieldnames = sorted(all_keys)

    # Save to JSON
    try:
        with open(json_filename, 'w', encoding='utf-8') as f_json:
            json.dump(final_data, f_json, ensure_ascii=False, indent=4)
        print(f"Successfully saved processed JSON results to: {json_filename}")
    except Exception as e:
        print(f"Error saving processed data to JSON file ({json_filename}): {e}")

    # Save to CSV
    try:
        with open(csv_filename, 'w', newline='', encoding='utf-8') as f_csv:
            writer = csv.DictWriter(f_csv, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(final_data)
        print(f"Successfully saved processed CSV results to: {csv_filename}")
    except Exception as e:
        print(f"Error saving processed data to CSV file ({csv_filename}): {e}")

    print("\n--- Phase 3 Complete ---")


# --- Main Execution ---
if __name__ == "__main__":
    print("Starting Specific SHL URL Crawler and Analyzer...")

    # --- Phase 1 ---
    # crawl_and_scrape_raw_specific(
    #     start_url=START_URL,
    #     max_pages=MAX_PAGES_TO_CRAWL,
    #     output_filename=RAW_DATA_FILENAME
    # )
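    # NOTE: Phase 1 is left commented out, presumably so an existing raw data file
    # from a previous crawl is reused; uncomment the call above to crawl afresh.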

    # --- Phase 2 ---
    processed_data = process_raw_data_with_ai(RAW_DATA_FILENAME)

    # --- Phase 3 ---
    save_processed_results(
        final_data=processed_data,
        json_filename=PROCESSED_JSON_FILENAME,
        csv_filename=PROCESSED_CSV_FILENAME
    )

    print("\nScript finished.")