# demoprep/legitdata_project/legitdata/analyzer/context_builder.py
# Vendored by mikeboone — commit 1502291,
# "feat: vendor legitdata source package into repository".
"""Context builder for extracting company information."""
import json
import re
from dataclasses import dataclass, asdict
from typing import Optional
import urllib.request
import urllib.error
from html.parser import HTMLParser
@dataclass
class CompanyContext:
    """Structured company information extracted from URL."""
    company_name: str
    industry: str
    description: str
    products_services: list[str]
    target_customers: str
    geographic_focus: str
    terminology: list[str]
    raw_content: str = ""

    def to_prompt(self) -> str:
        """Convert to a prompt-friendly string (one labeled field per line)."""
        sections = (
            f"Company: {self.company_name}",
            f"Industry: {self.industry}",
            f"Description: {self.description}",
            f"Products/Services: {', '.join(self.products_services)}",
            f"Target Customers: {self.target_customers}",
            f"Geographic Focus: {self.geographic_focus}",
            f"Key Terminology: {', '.join(self.terminology)}",
        )
        return "\n".join(sections)

    def to_dict(self) -> dict:
        """Convert to dictionary for caching."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'CompanyContext':
        """Create from dictionary."""
        return cls(**data)
class SimpleHTMLTextExtractor(HTMLParser):
    """Extract visible text content from HTML.

    Text inside non-content tags (script, style, noscript) is skipped.
    ``meta`` and ``link`` are HTML *void* elements: HTMLParser fires
    handle_starttag for them but never handle_endtag, so tracking them in
    the skip depth would leave the parser stuck in skip mode and drop all
    following page text. They carry no text content, so they are excluded
    from depth tracking entirely.
    """

    # Void elements: no closing tag in HTML, and no text content to skip.
    _VOID_TAGS = {'meta', 'link'}

    def __init__(self):
        super().__init__()
        self.text_parts = []          # accumulated visible text fragments
        self.skip_tags = {'script', 'style', 'meta', 'link', 'noscript'}
        self.current_skip = False     # True while inside a skip container
        self.skip_depth = 0           # nesting depth of open skip containers

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        # Only container skip tags (script/style/noscript) open a skip
        # region; void tags would never be closed (see class docstring).
        if tag in self.skip_tags and tag not in self._VOID_TAGS:
            self.current_skip = True
            self.skip_depth += 1

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in self.skip_tags and tag not in self._VOID_TAGS and self.skip_depth > 0:
            self.skip_depth -= 1
            if self.skip_depth == 0:
                self.current_skip = False

    def handle_data(self, data):
        # Keep non-empty text that is not inside a skip container.
        if not self.current_skip:
            text = data.strip()
            if text:
                self.text_parts.append(text)

    def get_text(self) -> str:
        """Return all collected text joined by single spaces."""
        return ' '.join(self.text_parts)
class ContextBuilder:
    """Builds company context from URL and use case.

    Pipeline: scrape the company URL -> (fallback) web search ->
    AI extraction of structured fields -> (fallback) minimal context
    derived from the URL and use case alone.
    """

    # Ordered keyword -> industry mapping; first match wins.
    _INDUSTRY_KEYWORDS = (
        ('retail', "Retail"),
        ('supply', "Supply Chain / Logistics"),
        ('financial', "Financial Services"),
        ('marketing', "Marketing / Advertising"),
        ('sales', "Sales / Commerce"),
        ('customer', "Customer Service"),
    )

    def __init__(self, anthropic_client=None, web_search_fn=None):
        """
        Initialize context builder.

        Args:
            anthropic_client: Anthropic client for AI extraction; when None,
                only the minimal (non-AI) context path is used.
            web_search_fn: Function to search web (fallback if scraping fails).
        """
        self.anthropic_client = anthropic_client
        self.web_search_fn = web_search_fn

    def build_context(self, url: str, use_case: str) -> "CompanyContext":
        """
        Build company context from URL and use case.

        Args:
            url: Company website URL
            use_case: The analytics use case (e.g., "Retail Analytics")

        Returns:
            CompanyContext with extracted information
        """
        # Try to scrape the URL directly first.
        raw_content = self._scrape_url(url)

        # If scraping failed, fall back to a web search when available.
        if not raw_content and self.web_search_fn:
            company_name = self._extract_company_from_url(url)
            raw_content = self.web_search_fn(f"{company_name} company about")

        if not raw_content:
            # Last resort: synthesize minimal context from the URL alone.
            return self._create_minimal_context(url, use_case)

        # Use AI to extract structured context when a client is configured.
        if self.anthropic_client:
            return self._extract_context_with_ai(raw_content, url, use_case)
        return self._create_minimal_context(url, use_case, raw_content)

    def _scrape_url(self, url: str) -> Optional[str]:
        """Scrape visible text content from a URL.

        Returns:
            Up to 10,000 characters of page text, or None when the page
            cannot be fetched or contains no text. Failures are deliberately
            non-fatal: callers route to the search / minimal-context fallbacks.
        """
        try:
            # Ensure the URL has a scheme so urllib accepts it.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
            req = urllib.request.Request(
                url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (compatible; LegitData/1.0)'
                }
            )
            with urllib.request.urlopen(req, timeout=10) as response:
                # Honor the server-declared charset; fall back to UTF-8.
                charset = response.headers.get_content_charset() or 'utf-8'
                html = response.read().decode(charset, errors='ignore')
            # Extract visible text from the HTML.
            extractor = SimpleHTMLTextExtractor()
            extractor.feed(html)
            text = extractor.get_text()
            # Truncate to a reasonable size for downstream prompts.
            return text[:10000] if text else None
        except Exception as e:
            # Broad on purpose: scraping is best-effort, and any failure
            # (network, HTTP, decode, parse) should degrade to the fallbacks.
            print(f"Warning: Could not scrape {url}: {e}")
            return None

    def _extract_company_from_url(self, url: str) -> str:
        """Derive a human-readable company name from a URL's domain."""
        # Remove scheme and leading "www."
        cleaned = re.sub(r'^https?://(www\.)?', '', url)
        # Keep only the domain part (drop any path).
        domain = cleaned.split('/')[0]
        # Drop the top-level domain.
        company = domain.rsplit('.', 1)[0]
        # Turn separators into spaces and title-case the result.
        company = company.replace('-', ' ').replace('_', ' ')
        return company.title()

    def _extract_context_with_ai(
        self,
        raw_content: str,
        url: str,
        use_case: str
    ) -> "CompanyContext":
        """Use AI to extract structured company context.

        Falls back to a minimal context if the API call fails or the
        response is not valid JSON.
        """
        prompt = f"""Analyze this website content and extract company information.

Website URL: {url}
Use Case: {use_case}

Website Content:
{raw_content[:8000]}

Extract the following information and return as JSON:
{{
  "company_name": "Official company name",
  "industry": "Primary industry/vertical",
  "description": "Brief company description (1-2 sentences)",
  "products_services": ["List of main products or services"],
  "target_customers": "Who they sell to (B2B, B2C, enterprise, SMB, etc.)",
  "geographic_focus": "Where they operate (global, US, Europe, etc.)",
  "terminology": ["Industry-specific terms or jargon used"]
}}

Return ONLY valid JSON, no other text."""
        try:
            response = self.anthropic_client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=1000,
                messages=[{"role": "user", "content": prompt}]
            )
            json_str = response.content[0].text.strip()
            # Models sometimes wrap JSON in markdown code fences; strip them.
            if json_str.startswith('```'):
                json_str = re.sub(r'^```\w*\n?', '', json_str)
                json_str = re.sub(r'\n?```$', '', json_str)
            data = json.loads(json_str)
            return CompanyContext(
                company_name=data.get("company_name", self._extract_company_from_url(url)),
                industry=data.get("industry", "Unknown"),
                description=data.get("description", ""),
                products_services=data.get("products_services", []),
                target_customers=data.get("target_customers", "General"),
                geographic_focus=data.get("geographic_focus", "Global"),
                terminology=data.get("terminology", []),
                raw_content=raw_content[:2000]
            )
        except Exception as e:
            # Best-effort: any API or parsing failure degrades to minimal context.
            print(f"Warning: AI extraction failed: {e}")
            return self._create_minimal_context(url, use_case, raw_content)

    def _create_minimal_context(
        self,
        url: str,
        use_case: str,
        raw_content: str = ""
    ) -> "CompanyContext":
        """Create minimal context when extraction fails."""
        company_name = self._extract_company_from_url(url)
        return CompanyContext(
            company_name=company_name,
            industry=self._infer_industry_from_use_case(use_case),
            description=f"{company_name} - {use_case}",
            products_services=[],
            target_customers="General",
            geographic_focus="Global",
            terminology=[],
            raw_content=raw_content[:2000] if raw_content else ""
        )

    def _infer_industry_from_use_case(self, use_case: str) -> str:
        """Infer an industry label from keywords in the use case.

        First matching keyword wins; defaults to "General Business".
        """
        use_case_lower = use_case.lower()
        for keyword, industry in self._INDUSTRY_KEYWORDS:
            if keyword in use_case_lower:
                return industry
        return "General Business"