# demoprep/legitdata_project/legitdata/analyzer/context_builder.py
# Vendored by mikeboone — commit 1502291,
# "feat: vendor legitdata source package into repository".
"""Context builder for extracting company information."""
import json
import re
from dataclasses import dataclass, asdict
from typing import Optional
import urllib.request
import urllib.error
from html.parser import HTMLParser
@dataclass
class CompanyContext:
    """Structured company information extracted from URL."""
    company_name: str
    industry: str
    description: str
    products_services: list[str]
    target_customers: str
    geographic_focus: str
    terminology: list[str]
    raw_content: str = ""

    def to_prompt(self) -> str:
        """Convert to a prompt-friendly string (one labeled field per line)."""
        sections = (
            f"Company: {self.company_name}",
            f"Industry: {self.industry}",
            f"Description: {self.description}",
            f"Products/Services: {', '.join(self.products_services)}",
            f"Target Customers: {self.target_customers}",
            f"Geographic Focus: {self.geographic_focus}",
            f"Key Terminology: {', '.join(self.terminology)}",
        )
        return "\n".join(sections)

    def to_dict(self) -> dict:
        """Convert to dictionary for caching."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'CompanyContext':
        """Create from dictionary."""
        return cls(**data)
class SimpleHTMLTextExtractor(HTMLParser):
    """Extract visible text content from HTML.

    Text inside non-content tags (script, style, noscript) is skipped.
    ``meta`` and ``link`` are HTML *void* elements: HTMLParser fires
    handle_starttag for them but never handle_endtag, so tracking them in
    the skip depth would leave the parser stuck in skip mode and drop all
    following page text. They carry no text content, so they are excluded
    from depth tracking entirely.
    """

    # Void elements: no closing tag in HTML, and no text content to skip.
    _VOID_TAGS = {'meta', 'link'}

    def __init__(self):
        super().__init__()
        self.text_parts = []          # accumulated visible text fragments
        self.skip_tags = {'script', 'style', 'meta', 'link', 'noscript'}
        self.current_skip = False     # True while inside a skip container
        self.skip_depth = 0           # nesting depth of open skip containers

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        # Only container skip tags (script/style/noscript) open a skip
        # region; void tags would never be closed (see class docstring).
        if tag in self.skip_tags and tag not in self._VOID_TAGS:
            self.current_skip = True
            self.skip_depth += 1

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in self.skip_tags and tag not in self._VOID_TAGS and self.skip_depth > 0:
            self.skip_depth -= 1
            if self.skip_depth == 0:
                self.current_skip = False

    def handle_data(self, data):
        # Keep non-empty text that is not inside a skip container.
        if not self.current_skip:
            text = data.strip()
            if text:
                self.text_parts.append(text)

    def get_text(self) -> str:
        """Return all collected text joined by single spaces."""
        return ' '.join(self.text_parts)
class ContextBuilder:
    """Builds company context from URL and use case.

    Pipeline: scrape the company URL -> (fallback) web search ->
    AI extraction of structured fields -> (fallback) minimal context
    derived from the URL and use case alone.
    """

    # Ordered keyword -> industry mapping; first match wins.
    _INDUSTRY_KEYWORDS = (
        ('retail', "Retail"),
        ('supply', "Supply Chain / Logistics"),
        ('financial', "Financial Services"),
        ('marketing', "Marketing / Advertising"),
        ('sales', "Sales / Commerce"),
        ('customer', "Customer Service"),
    )

    def __init__(self, anthropic_client=None, web_search_fn=None):
        """
        Initialize context builder.

        Args:
            anthropic_client: Anthropic client for AI extraction; when None,
                only the minimal (non-AI) context path is used.
            web_search_fn: Function to search web (fallback if scraping fails).
        """
        self.anthropic_client = anthropic_client
        self.web_search_fn = web_search_fn

    def build_context(self, url: str, use_case: str) -> "CompanyContext":
        """
        Build company context from URL and use case.

        Args:
            url: Company website URL
            use_case: The analytics use case (e.g., "Retail Analytics")

        Returns:
            CompanyContext with extracted information
        """
        # Try to scrape the URL directly first.
        raw_content = self._scrape_url(url)

        # If scraping failed, fall back to a web search when available.
        if not raw_content and self.web_search_fn:
            company_name = self._extract_company_from_url(url)
            raw_content = self.web_search_fn(f"{company_name} company about")

        if not raw_content:
            # Last resort: synthesize minimal context from the URL alone.
            return self._create_minimal_context(url, use_case)

        # Use AI to extract structured context when a client is configured.
        if self.anthropic_client:
            return self._extract_context_with_ai(raw_content, url, use_case)
        return self._create_minimal_context(url, use_case, raw_content)

    def _scrape_url(self, url: str) -> Optional[str]:
        """Scrape visible text content from a URL.

        Returns:
            Up to 10,000 characters of page text, or None when the page
            cannot be fetched or contains no text. Failures are deliberately
            non-fatal: callers route to the search / minimal-context fallbacks.
        """
        try:
            # Ensure the URL has a scheme so urllib accepts it.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
            req = urllib.request.Request(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (compatible; LegitData/1.0)'
                }
            )
            with urllib.request.urlopen(req, timeout=10) as response:
                # Honor the server-declared charset; fall back to UTF-8.
                charset = response.headers.get_content_charset() or 'utf-8'
                html = response.read().decode(charset, errors='ignore')
            # Extract visible text from the HTML.
            extractor = SimpleHTMLTextExtractor()
            extractor.feed(html)
            text = extractor.get_text()
            # Truncate to a reasonable size for downstream prompts.
            return text[:10000] if text else None
        except Exception as e:
            # Broad on purpose: scraping is best-effort, and any failure
            # (network, HTTP, decode, parse) should degrade to the fallbacks.
            print(f"Warning: Could not scrape {url}: {e}")
            return None

    def _extract_company_from_url(self, url: str) -> str:
        """Derive a human-readable company name from a URL's domain."""
        # Remove scheme and leading "www."
        cleaned = re.sub(r'^https?://(www\.)?', '', url)
        # Keep only the domain part (drop any path).
        domain = cleaned.split('/')[0]
        # Drop the top-level domain.
        company = domain.rsplit('.', 1)[0]
        # Turn separators into spaces and title-case the result.
        company = company.replace('-', ' ').replace('_', ' ')
        return company.title()

    def _extract_context_with_ai(
        self,
        raw_content: str,
        url: str,
        use_case: str
    ) -> "CompanyContext":
        """Use AI to extract structured company context.

        Falls back to a minimal context if the API call fails or the
        response is not valid JSON.
        """
        prompt = f"""Analyze this website content and extract company information.

Website URL: {url}
Use Case: {use_case}

Website Content:
{raw_content[:8000]}

Extract the following information and return as JSON:
{{
  "company_name": "Official company name",
  "industry": "Primary industry/vertical",
  "description": "Brief company description (1-2 sentences)",
  "products_services": ["List of main products or services"],
  "target_customers": "Who they sell to (B2B, B2C, enterprise, SMB, etc.)",
  "geographic_focus": "Where they operate (global, US, Europe, etc.)",
  "terminology": ["Industry-specific terms or jargon used"]
}}

Return ONLY valid JSON, no other text."""
        try:
            response = self.anthropic_client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=1000,
                messages=[{"role": "user", "content": prompt}]
            )
            json_str = response.content[0].text.strip()
            # Models sometimes wrap JSON in markdown code fences; strip them.
            if json_str.startswith('```'):
                json_str = re.sub(r'^```\w*\n?', '', json_str)
                json_str = re.sub(r'\n?```$', '', json_str)
            data = json.loads(json_str)
            return CompanyContext(
                company_name=data.get("company_name", self._extract_company_from_url(url)),
                industry=data.get("industry", "Unknown"),
                description=data.get("description", ""),
                products_services=data.get("products_services", []),
                target_customers=data.get("target_customers", "General"),
                geographic_focus=data.get("geographic_focus", "Global"),
                terminology=data.get("terminology", []),
                raw_content=raw_content[:2000]
            )
        except Exception as e:
            # Best-effort: any API or parsing failure degrades to minimal context.
            print(f"Warning: AI extraction failed: {e}")
            return self._create_minimal_context(url, use_case, raw_content)

    def _create_minimal_context(
        self,
        url: str,
        use_case: str,
        raw_content: str = ""
    ) -> "CompanyContext":
        """Create minimal context when extraction fails."""
        company_name = self._extract_company_from_url(url)
        return CompanyContext(
            company_name=company_name,
            industry=self._infer_industry_from_use_case(use_case),
            description=f"{company_name} - {use_case}",
            products_services=[],
            target_customers="General",
            geographic_focus="Global",
            terminology=[],
            raw_content=raw_content[:2000] if raw_content else ""
        )

    def _infer_industry_from_use_case(self, use_case: str) -> str:
        """Infer an industry label from keywords in the use case.

        First matching keyword wins; defaults to "General Business".
        """
        use_case_lower = use_case.lower()
        for keyword, industry in self._INDUSTRY_KEYWORDS:
            if keyword in use_case_lower:
                return industry
        return "General Business"