# Stray capture artifacts from the original paste ("Spaces:" / "Sleeping" x2),
# commented out so the module parses as valid Python.
import os
import json
import csv
import time
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# --- Configuration ---
# Load environment variables (GOOGLE_API_KEY) from a local .env file.
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    # Fail fast at import time rather than mid-crawl.
    raise ValueError("GOOGLE_API_KEY not found in environment variables.")

# --- Crawler Configuration ---
START_URL = "https://www.shl.com/solutions/products/product-catalog/"
BASE_CATALOG_URL_PREFIX = "https://www.shl.com/solutions/products/product-catalog/"
# Define allowed URL patterns based on user input
ALLOWED_PATTERNS = [
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"$"),  # Exact base URL
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"\?start=\d+"),  # Pagination URLs
    re.compile(r"^" + re.escape(BASE_CATALOG_URL_PREFIX) + r"view/"),  # Detail View URLs
]
MAX_PAGES_TO_CRAWL = 200  # Safety limit (adjust as needed)
RAW_DATA_FILENAME = "shl_raw_scraped_data_specific.jsonl"  # New file for specific crawl

# --- Processing & Output Configuration ---
PROCESSED_JSON_FILENAME = "shl_processed_analysis_specific.json"  # New file
PROCESSED_CSV_FILENAME = "shl_processed_analysis_specific.csv"  # New file
POLITE_DELAY_SECONDS = 1  # delay between HTTP requests and between LLM calls
REQUEST_TIMEOUT = 30  # per-request timeout, seconds
MAX_LLM_CONTENT_CHARS = 15000  # page text is truncated to this before the LLM call
# Browser-like User-Agent so the crawler is less likely to be rejected as a bot.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# --- LLM Setup ---
# Re-initialize LLM components (same as before)
llm = ChatGoogleGenerativeAI(model="gemma-3-27b-it", google_api_key=google_api_key)
prompt_template = ChatPromptTemplate.from_template(
    """
Based *only* on the following text content scraped from an SHL web page, please provide:
1. A concise summary of the page's main topic (2-4 sentences).
2. If it describes an assessment or product, list its key features, benefits, or what it measures (up to 5 bullet points). Otherwise, state "Not applicable".
Do not add any information not present in the text. If the text is insufficient or irrelevant, state that.
Scraped Content:
{context}
Analysis:
"""
)
output_parser = StrOutputParser()
# LCEL pipeline: raw text -> {"context": text} -> prompt -> model -> plain string.
llm_chain = {"context": RunnablePassthrough()} | prompt_template | llm | output_parser
# --- Helper Functions ---
def get_soup(url):
    """Fetch *url* and return ``(soup, html_text)``, or ``(None, None)`` on error.

    The returned HTML string is the exact text the soup was parsed from, so
    callers that persist the raw HTML store what was actually analyzed.
    (Previously the soup was built from a lossy utf-8 decode of the body while
    ``response.text`` — requests' own, possibly different, decoding — was
    returned; the two could disagree.)
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        response.raise_for_status()
        # Only parse HTML responses; skip PDFs, images, etc.
        content_type = response.headers.get('content-type', '').lower()
        if 'html' not in content_type:
            print(f" Skipping URL {url}: Non-HTML content type ({content_type})")
            return None, None
        # Decode once and use the same string for parsing and for the caller.
        html_content = response.content.decode('utf-8', errors='ignore')
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup, html_content
    except requests.exceptions.Timeout:
        print(f" Timeout error fetching {url}")
        return None, None
    except requests.exceptions.RequestException as e:
        print(f" Request error fetching {url}: {e}")
        return None, None
    except Exception as e:
        # Catch-all for parser failures etc.; crawler treats it as a skipped page.
        print(f" Error processing {url}: {e}")
        return None, None
def is_allowed_shl_url(url):
    """Checks if the URL matches one of the defined allowed patterns."""
    # Cheap prefix rejection before running the regex scan.
    if not url or not url.startswith(BASE_CATALOG_URL_PREFIX):
        return False
    # Accept as soon as any allowed pattern matches the URL.
    return any(pattern.match(url) for pattern in ALLOWED_PATTERNS)
def extract_text_from_html(html_content):
    """Extracts and cleans text from raw HTML string."""
    if not html_content:
        return None
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove boilerplate / non-content elements before extracting text.
        noise_tags = ['script', 'style', 'nav', 'footer', 'header', 'aside', 'form']
        for element in soup(noise_tags):
            element.decompose()
        # Prefer <body> when present, otherwise fall back to the whole document.
        root = soup.body if soup.body else soup
        raw_text = root.get_text(separator=' ', strip=True)
        # Collapse all whitespace runs into single spaces.
        return re.sub(r'\s+', ' ', raw_text).strip()
    except Exception as e:
        # Best-effort: a bad snippet yields None rather than aborting the batch.
        print(f" Error extracting text from HTML snippet: {e}")
        return None
# --- Phase 1: Crawl & Scrape Raw HTML (Specific URLs) ---
def crawl_and_scrape_raw_specific(start_url, max_pages, output_filename):
    """BFS-crawl the SHL catalog and save each allowed page's raw HTML as JSONL.

    Starting from *start_url*, follows only links accepted by
    is_allowed_shl_url(); appends one {"url", "raw_html"} record per page to
    *output_filename* and stops after *max_pages* successfully scraped pages.
    """
    print(f"--- Starting Phase 1: Crawling Specific SHL URLs (Max: {max_pages} pages) ---")
    queue = deque([start_url])
    visited_urls = {start_url}  # every URL ever enqueued; prevents revisits
    pages_scraped = 0
    with open(output_filename, 'w') as f:  # Clear/prepare output file
        f.write("")
    print(f"Cleared/Prepared raw data file: {output_filename}")
    while queue and pages_scraped < max_pages:
        current_url = queue.popleft()
        print(f"\nProcessing ({pages_scraped + 1}/{max_pages}): {current_url}")
        # Check BEFORE fetching if URL is allowed (it should be if it came from queue, but good practice)
        if not is_allowed_shl_url(current_url):
            print(f" Skipping non-allowed URL from queue (should not happen): {current_url}")
            continue
        soup, raw_html = get_soup(current_url)
        if raw_html:
            try:
                # Append incrementally so a crash does not lose earlier pages.
                with open(output_filename, 'a', encoding='utf-8') as f_out:
                    json.dump({"url": current_url, "raw_html": raw_html}, f_out)
                    f_out.write('\n')
                pages_scraped += 1
                print(f" Successfully scraped and saved raw HTML ({len(raw_html)} bytes).")
            except Exception as e:
                print(f" Error saving raw data for {current_url}: {e}")
        if soup:
            links_found_on_page = 0
            new_links_added = 0
            for link in soup.find_all('a', href=True):
                href = link['href']
                absolute_url = urljoin(current_url, href)
                parsed_url = urlparse(absolute_url)
                normalized_url = parsed_url._replace(fragment="").geturl()  # Remove fragment
                links_found_on_page += 1
                # *** Crucial Change: Check against specific allowed patterns ***
                if is_allowed_shl_url(normalized_url) and normalized_url not in visited_urls:
                    visited_urls.add(normalized_url)
                    queue.append(normalized_url)
                    new_links_added += 1
            print(f" Inspected {links_found_on_page} links, added {new_links_added} new valid URLs to queue.")
        # Throttle every iteration, whether or not the fetch succeeded.
        print(f" Politely waiting for {POLITE_DELAY_SECONDS} second(s)...")
        time.sleep(POLITE_DELAY_SECONDS)
    print(f"\n--- Phase 1 Complete: Scraped {pages_scraped} pages matching allowed patterns. ---")
    print(f"Raw data saved incrementally to {output_filename}")
# --- Phase 2: Process Raw Data & AI Analysis (No Changes Needed) ---
def process_raw_data_with_ai(raw_data_input_filename):
    """Read the Phase-1 JSONL file, extract page text, and run the LLM chain.

    Returns a list of dicts with keys: url, extracted_text, ai_analysis,
    processing_status. Records that fail any stage are kept with an error
    status rather than dropped, so output lines up with input.
    """
    print(f"\n--- Starting Phase 2: Processing Raw Data from {raw_data_input_filename} & AI Analysis ---")
    processed_results = []
    processed_count = 0
    error_count = 0
    try:
        with open(raw_data_input_filename, 'r', encoding='utf-8') as f_in:
            for line_num, line in enumerate(f_in, 1):
                processed_count += 1
                url = f"Unknown (Line {line_num})"  # Default if parsing fails
                try:
                    data = json.loads(line)
                    url = data.get("url", f"Unknown (Line {line_num})")
                    raw_html = data.get("raw_html")
                    print(f"\nProcessing item {processed_count}: {url}")
                    if not url or not raw_html:
                        print(" Skipping: Missing URL or raw HTML in record.")
                        error_count += 1
                        processed_results.append({
                            "url": url, "extracted_text": None, "ai_analysis": None,
                            "processing_status": "Error: Invalid Raw Data Record" })
                        continue
                    print(" Extracting text from raw HTML...")
                    extracted_text = extract_text_from_html(raw_html)
                    ai_analysis = None
                    if not extracted_text:
                        print(" Failed to extract text.")
                        status = "Error: Text Extraction Failed"
                    else:
                        print(f" Extracted ~{len(extracted_text)} characters. Sending to AI...")
                        try:
                            # Truncate so the prompt stays within the model's context budget.
                            truncated_text = extracted_text
                            if len(extracted_text) > MAX_LLM_CONTENT_CHARS:
                                truncated_text = extracted_text[:MAX_LLM_CONTENT_CHARS] + "... (truncated)"
                                print(f" Text truncated to {MAX_LLM_CONTENT_CHARS} chars for LLM.")
                            ai_analysis = llm_chain.invoke(truncated_text)
                            print(" AI analysis received.")
                            status = "Success: Analyzed"
                            # Throttle LLM calls the same way HTTP requests are throttled.
                            print(f" Politely waiting for {POLITE_DELAY_SECONDS} second(s)...")
                            time.sleep(POLITE_DELAY_SECONDS)
                        except Exception as e:
                            print(f" Error during AI analysis: {e}")
                            ai_analysis = f"Error during AI analysis: {e}"
                            status = f"Error: AI Failed ({type(e).__name__})"
                            error_count += 1
                    processed_results.append({
                        "url": url, "extracted_text": extracted_text, "ai_analysis": ai_analysis,
                        "processing_status": status })
                except json.JSONDecodeError as e:
                    print(f" Skipping invalid JSON line {line_num}: {e}")
                    error_count += 1
                    processed_results.append({  # Add error record
                        "url": f"Unknown (Line {line_num})", "extracted_text": None, "ai_analysis": None,
                        "processing_status": "Error: Invalid JSON in Raw Data" })
                    continue
                except Exception as e:
                    print(f" Unexpected error processing line {line_num} ({url}): {e}")
                    error_count += 1
                    processed_results.append({
                        "url": url, "extracted_text": None, "ai_analysis": None,
                        "processing_status": f"Error: Unexpected Processing Failure ({type(e).__name__})" })
                    continue
    except FileNotFoundError:
        print(f"Error: Raw data file '{raw_data_input_filename}' not found. Cannot proceed.")
        return []
    except Exception as e:
        # Mid-read failure: return whatever was processed so far (no summary print).
        print(f"An unexpected error occurred while reading raw data file: {e}")
        return processed_results
    print(f"\n--- Phase 2 Complete: Processed {processed_count} records with {error_count} errors. ---")
    return processed_results
# --- Phase 3: Save Processed Results (No Changes Needed) ---
def save_processed_results(final_data, json_filename, csv_filename):
    """Saves the final processed data to JSON and CSV files."""
    print("\n--- Starting Phase 3: Saving Processed Results ---")
    if not final_data:
        print("No processed data to save.")
        return
    # Union of keys across all records -> one stable, sorted CSV header.
    key_union = set()
    for record in final_data:
        key_union.update(record.keys())
    fieldnames = sorted(key_union)
    # Save to JSON
    try:
        with open(json_filename, 'w', encoding='utf-8') as f_json:
            json.dump(final_data, f_json, ensure_ascii=False, indent=4)
        print(f"Successfully saved processed JSON results to: {json_filename}")
    except Exception as e:
        print(f"Error saving processed data to JSON file ({json_filename}): {e}")
    # Save to CSV
    try:
        with open(csv_filename, 'w', newline='', encoding='utf-8') as f_csv:
            writer = csv.DictWriter(f_csv, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(final_data)
        print(f"Successfully saved processed CSV results to: {csv_filename}")
    except Exception as e:
        print(f"Error saving processed data to CSV file ({csv_filename}): {e}")
    print("\n--- Phase 3 Complete ---")
# --- Main Execution ---
if __name__ == "__main__":
    print("Starting Specific SHL URL Crawler and Analyzer...")
    # # --- Phase 1 ---
    # NOTE(review): Phase 1 (the crawl) is deliberately commented out — this run
    # re-processes the previously scraped RAW_DATA_FILENAME. Uncomment to re-crawl.
    # crawl_and_scrape_raw_specific(
    #     start_url=START_URL,
    #     max_pages=MAX_PAGES_TO_CRAWL,
    #     output_filename=RAW_DATA_FILENAME
    # )
    # --- Phase 2 ---
    processed_data = process_raw_data_with_ai(RAW_DATA_FILENAME)
    # --- Phase 3 ---
    save_processed_results(
        final_data=processed_data,
        json_filename=PROCESSED_JSON_FILENAME,
        csv_filename=PROCESSED_CSV_FILENAME
    )
    print("\nScript finished.")