SHL / test.py
import os
import json
import csv
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# --- Configuration ---
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")
# --- Crawler Configuration ---
START_URL = "https://www.shl.com/solutions/products/product-catalog/"
BASE_CATALOG_URL_PREFIX = "https://www.shl.com/solutions/products/product-catalog/"
# Define allowed URL patterns based on user input
ALLOWED_PATTERNS = [
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"$"),            # Exact base URL
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"\?start=\d+"),  # Pagination URLs
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"view/"),        # Detail view URLs
]
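# URLs these patterns are meant to accept (illustrative shapes, not taken from
# the live site):
#   .../product-catalog/               (exact base URL)
#   .../product-catalog/?start=12      (pagination)
#   .../product-catalog/view/<slug>    (product detail pages)
# Note that re.match anchors at the start only, so the pagination and view
# patterns also tolerate trailing characters such as extra query parameters.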
MAX_PAGES_TO_CRAWL = 200 # Safety limit (adjust as needed)
RAW_DATA_FILENAME = "shl_raw_scraped_data_specific.jsonl" # New file for specific crawl
# --- Processing & Output Configuration ---
PROCESSED_JSON_FILENAME = "shl_processed_analysis_specific.json" # New file
PROCESSED_CSV_FILENAME = "shl_processed_analysis_specific.csv" # New file
POLITE_DELAY_SECONDS = 1
REQUEST_TIMEOUT = 30
MAX_LLM_CONTENT_CHARS = 15000
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
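# A browser-like User-Agent is sent because some servers return reduced or
# blocked content to the default python-requests agent string.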
# --- LLM Setup ---
# Re-initialize LLM components (same as before)
llm = ChatGoogleGenerativeAI(model="gemma-3-27b-it", google_api_key=google_api_key)
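# Assumption: "gemma-3-27b-it" is simply the model this script targets; any
# chat model exposed through langchain_google_genai should slot in here.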
prompt_template = ChatPromptTemplate.from_template(
    """
Based *only* on the following text content scraped from an SHL web page, please provide:
1. A concise summary of the page's main topic (2-4 sentences).
2. If it describes an assessment or product, list its key features, benefits, or what it measures (up to 5 bullet points). Otherwise, state "Not applicable".
Do not add any information not present in the text. If the text is insufficient or irrelevant, state that.

Scraped Content:
{context}

Analysis:
"""
)
output_parser = StrOutputParser()
llm_chain = {"context": RunnablePassthrough()} | prompt_template | llm | output_parser
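# LCEL composition: the leading dict becomes a RunnableParallel that feeds the
# chain's single input into the prompt's {context} variable, so the chain can
# be invoked with a bare string (illustrative):
#   analysis_text = llm_chain.invoke("text scraped from a catalog page")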
# --- Helper Functions ---
def get_soup(url):
    """Fetches a URL and returns a (soup, raw_html) tuple, or (None, None) on any error."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        response.raise_for_status()
        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            print(f"  Skipping URL {url}: Non-HTML content type ({content_type})")
            return None, None
        html_content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup, html_content  # Return the same decoded string the soup was parsed from
    except requests.exceptions.Timeout:
        print(f"  Timeout error fetching {url}")
        return None, None
    except requests.exceptions.RequestException as e:
        print(f"  Request error fetching {url}: {e}")
        return None, None
    except Exception as e:
        print(f"  Error processing {url}: {e}")
        return None, None
def is_allowed_shl_url(url):
    """Checks whether the URL matches one of the defined allowed patterns."""
    # Cheap prefix check first, to reject most URLs without touching the regexes
    if not url or not url.startswith(BASE_CATALOG_URL_PREFIX):
        return False
    # Then check against the specific regex patterns
    for pattern in ALLOWED_PATTERNS:
        if pattern.match(url):
            return True
    # print(f"  Debug: URL rejected by patterns: {url}")  # Optional debug
    return False
def extract_text_from_html(html_content):
    """Extracts and cleans visible text from a raw HTML string."""
    if not html_content:
        return None
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Drop non-content elements before extracting text
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form']):
            element.decompose()
        body = soup.body
        text = body.get_text(separator=' ', strip=True) if body else soup.get_text(separator=' ', strip=True)
        text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace
        return text
    except Exception as e:
        print(f"  Error extracting text from HTML snippet: {e}")
        return None
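# Illustrative behaviour of the extractor (not executed as part of the script):
#   extract_text_from_html("<body><p>Hi</p><script>x()</script></body>")
#   returns "Hi": the script/style/nav/footer/header/aside/form subtrees are
#   removed before the text is pulled out and whitespace is collapsed.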
# --- Phase 1: Crawl & Scrape Raw HTML (Specific URLs) ---
def crawl_and_scrape_raw_specific(start_url, max_pages, output_filename):
    print(f"--- Starting Phase 1: Crawling Specific SHL URLs (Max: {max_pages} pages) ---")
    queue = deque([start_url])
    visited_urls = {start_url}
    pages_scraped = 0

    with open(output_filename, 'w') as f:  # Clear/prepare the output file
        f.write("")
    print(f"Cleared/Prepared raw data file: {output_filename}")

    while queue and pages_scraped < max_pages:
        current_url = queue.popleft()
        print(f"\nProcessing ({pages_scraped + 1}/{max_pages}): {current_url}")

        # Re-check before fetching; anything in the queue should already be
        # allowed, but this guards against future changes to the enqueue logic.
        if not is_allowed_shl_url(current_url):
            print(f"  Skipping non-allowed URL from queue (should not happen): {current_url}")
            continue

        soup, raw_html = get_soup(current_url)

        if raw_html:
            try:
                with open(output_filename, 'a', encoding='utf-8') as f_out:
                    json.dump({"url": current_url, "raw_html": raw_html}, f_out)
                    f_out.write('\n')
                pages_scraped += 1
                print(f"  Successfully scraped and saved raw HTML ({len(raw_html)} characters).")
            except Exception as e:
                print(f"  Error saving raw data for {current_url}: {e}")

        if soup:
            links_found_on_page = 0
            new_links_added = 0
            for link in soup.find_all('a', href=True):
                href = link['href']
                absolute_url = urljoin(current_url, href)
                parsed_url = urlparse(absolute_url)
                normalized_url = parsed_url._replace(fragment="").geturl()  # Remove fragment
                links_found_on_page += 1
                # Crucial: only enqueue URLs matching the specific allowed patterns
                if is_allowed_shl_url(normalized_url) and normalized_url not in visited_urls:
                    visited_urls.add(normalized_url)
                    queue.append(normalized_url)
                    new_links_added += 1
            print(f"  Inspected {links_found_on_page} links, added {new_links_added} new valid URLs to the queue.")

        print(f"  Politely waiting for {POLITE_DELAY_SECONDS} second(s)...")
        time.sleep(POLITE_DELAY_SECONDS)

    print(f"\n--- Phase 1 Complete: Scraped {pages_scraped} pages matching allowed patterns. ---")
    print(f"Raw data saved incrementally to {output_filename}")
# --- Phase 2: Process Raw Data & AI Analysis (No Changes Needed) ---
def process_raw_data_with_ai(raw_data_input_filename):
    print(f"\n--- Starting Phase 2: Processing Raw Data from {raw_data_input_filename} & AI Analysis ---")
    processed_results = []
    processed_count = 0
    error_count = 0
    try:
        with open(raw_data_input_filename, 'r', encoding='utf-8') as f_in:
            for line_num, line in enumerate(f_in, 1):
                processed_count += 1
                url = f"Unknown (Line {line_num})"  # Default if parsing fails
                try:
                    data = json.loads(line)
                    url = data.get("url", f"Unknown (Line {line_num})")
                    raw_html = data.get("raw_html")
                    print(f"\nProcessing item {processed_count}: {url}")

                    if not url or not raw_html:
                        print("  Skipping: Missing URL or raw HTML in record.")
                        error_count += 1
                        processed_results.append({
                            "url": url, "extracted_text": None, "ai_analysis": None,
                            "processing_status": "Error: Invalid Raw Data Record"})
                        continue

                    print("  Extracting text from raw HTML...")
                    extracted_text = extract_text_from_html(raw_html)
                    ai_analysis = None

                    if not extracted_text:
                        print("  Failed to extract text.")
                        status = "Error: Text Extraction Failed"
                        error_count += 1  # Count extraction failures as errors too
                    else:
                        print(f"  Extracted ~{len(extracted_text)} characters. Sending to AI...")
                        try:
                            truncated_text = extracted_text
                            if len(extracted_text) > MAX_LLM_CONTENT_CHARS:
                                truncated_text = extracted_text[:MAX_LLM_CONTENT_CHARS] + "... (truncated)"
                                print(f"  Text truncated to {MAX_LLM_CONTENT_CHARS} chars for LLM.")
                            ai_analysis = llm_chain.invoke(truncated_text)
                            print("  AI analysis received.")
                            status = "Success: Analyzed"
                            print(f"  Politely waiting for {POLITE_DELAY_SECONDS} second(s)...")
                            time.sleep(POLITE_DELAY_SECONDS)
                        except Exception as e:
                            print(f"  Error during AI analysis: {e}")
                            ai_analysis = f"Error during AI analysis: {e}"
                            status = f"Error: AI Failed ({type(e).__name__})"
                            error_count += 1

                    processed_results.append({
                        "url": url, "extracted_text": extracted_text, "ai_analysis": ai_analysis,
                        "processing_status": status})
                except json.JSONDecodeError as e:
                    print(f"  Skipping invalid JSON line {line_num}: {e}")
                    error_count += 1
                    processed_results.append({  # Add an error record so the line is not lost
                        "url": f"Unknown (Line {line_num})", "extracted_text": None, "ai_analysis": None,
                        "processing_status": "Error: Invalid JSON in Raw Data"})
                    continue
                except Exception as e:
                    print(f"  Unexpected error processing line {line_num} ({url}): {e}")
                    error_count += 1
                    processed_results.append({
                        "url": url, "extracted_text": None, "ai_analysis": None,
                        "processing_status": f"Error: Unexpected Processing Failure ({type(e).__name__})"})
                    continue
    except FileNotFoundError:
        print(f"Error: Raw data file '{raw_data_input_filename}' not found. Cannot proceed.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred while reading raw data file: {e}")
        return processed_results

    print(f"\n--- Phase 2 Complete: Processed {processed_count} records with {error_count} errors. ---")
    return processed_results
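# Every record carries the same four keys (url, extracted_text, ai_analysis,
# processing_status) whatever the outcome, so failures remain visible in the
# final JSON/CSV output instead of being silently dropped.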
# --- Phase 3: Save Processed Results (No Changes Needed) ---
def save_processed_results(final_data, json_filename, csv_filename):
    """Saves the final processed data to JSON and CSV files."""
    print("\n--- Starting Phase 3: Saving Processed Results ---")
    if not final_data:
        print("No processed data to save.")
        return

    # Collect the union of keys across all records so the CSV header covers
    # every field that appears anywhere in the results.
    all_keys = set()
    for item in final_data:
        all_keys.update(item.keys())
    fieldnames = sorted(all_keys)

    # Save to JSON
    try:
        with open(json_filename, 'w', encoding='utf-8') as f_json:
            json.dump(final_data, f_json, ensure_ascii=False, indent=4)
        print(f"Successfully saved processed JSON results to: {json_filename}")
    except Exception as e:
        print(f"Error saving processed data to JSON file ({json_filename}): {e}")

    # Save to CSV
    try:
        with open(csv_filename, 'w', newline='', encoding='utf-8') as f_csv:
            writer = csv.DictWriter(f_csv, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(final_data)
        print(f"Successfully saved processed CSV results to: {csv_filename}")
    except Exception as e:
        print(f"Error saving processed data to CSV file ({csv_filename}): {e}")

    print("\n--- Phase 3 Complete ---")
# --- Main Execution ---
if __name__ == "__main__":
print("Starting Specific SHL URL Crawler and Analyzer...")
# # --- Phase 1 ---
# crawl_and_scrape_raw_specific(
# start_url=START_URL,
# max_pages=MAX_PAGES_TO_CRAWL,
# output_filename=RAW_DATA_FILENAME
# )
# --- Phase 2 ---
processed_data = process_raw_data_with_ai(RAW_DATA_FILENAME)
# --- Phase 3 ---
save_processed_results(
final_data=processed_data,
json_filename=PROCESSED_JSON_FILENAME,
csv_filename=PROCESSED_CSV_FILENAME
)
print("\nScript finished.")