Spaces:
Running
Running
| """Context builder for extracting company information.""" | |
| import json | |
| import re | |
| from dataclasses import dataclass, asdict | |
| from typing import Optional | |
| import urllib.request | |
| import urllib.error | |
| from html.parser import HTMLParser | |
@dataclass
class CompanyContext:
    """Structured company information extracted from URL.

    Missing the @dataclass decorator previously: without it the bare
    annotations generate no __init__, and asdict() in to_dict() raises
    TypeError on a non-dataclass instance.
    """

    company_name: str              # official company name
    industry: str                  # primary industry/vertical
    description: str               # brief company description
    products_services: list[str]   # main products or services
    target_customers: str          # who the company sells to
    geographic_focus: str          # where the company operates
    terminology: list[str]         # industry-specific terms/jargon
    raw_content: str = ""          # truncated raw website text (for caching)

    def to_prompt(self) -> str:
        """Convert to a prompt-friendly string."""
        return f"""Company: {self.company_name}
Industry: {self.industry}
Description: {self.description}
Products/Services: {', '.join(self.products_services)}
Target Customers: {self.target_customers}
Geographic Focus: {self.geographic_focus}
Key Terminology: {', '.join(self.terminology)}"""

    def to_dict(self) -> dict:
        """Convert to dictionary for caching."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> 'CompanyContext':
        """Create from dictionary (inverse of to_dict).

        Previously missing @classmethod, so CompanyContext.from_dict(data)
        bound `data` to `cls` and failed.
        """
        return cls(**data)
class SimpleHTMLTextExtractor(HTMLParser):
    """Extract visible text content from HTML.

    Skips the contents of non-visible container tags (script/style/noscript).

    Bug fix: ``meta`` and ``link`` are void elements — they never produce an
    end tag, so counting them toward ``skip_depth`` left the parser stuck in
    skip mode after the first ``<meta>`` and dropped all subsequent page text.
    Void elements carry no text content, so they are excluded from depth
    tracking entirely.
    """

    def __init__(self):
        super().__init__()
        self.text_parts = []  # accumulated visible text fragments
        # Tags whose content should never appear in extracted text.
        self.skip_tags = {'script', 'style', 'meta', 'link', 'noscript'}
        # Void elements: no closing tag, so they must not affect skip depth.
        self.void_tags = {'meta', 'link'}
        self.current_skip = False  # True while inside a skip container
        self.skip_depth = 0        # nesting depth of open skip containers

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        # Only container skip-tags open a skip region; void elements
        # (meta/link) have no end tag and would leak the depth counter.
        if tag in self.skip_tags and tag not in self.void_tags:
            self.current_skip = True
            self.skip_depth += 1

    def handle_endtag(self, tag):
        if tag.lower() in self.skip_tags and self.skip_depth > 0:
            self.skip_depth -= 1
            if self.skip_depth == 0:
                self.current_skip = False

    def handle_data(self, data):
        # Collect only non-empty text outside skip regions.
        if not self.current_skip:
            text = data.strip()
            if text:
                self.text_parts.append(text)

    def get_text(self) -> str:
        """Return all collected text fragments joined by single spaces."""
        return ' '.join(self.text_parts)
| class ContextBuilder: | |
| """Builds company context from URL and use case.""" | |
| def __init__(self, anthropic_client=None, web_search_fn=None): | |
| """ | |
| Initialize context builder. | |
| Args: | |
| anthropic_client: Anthropic client for AI extraction | |
| web_search_fn: Function to search web (fallback if scraping fails) | |
| """ | |
| self.anthropic_client = anthropic_client | |
| self.web_search_fn = web_search_fn | |
| def build_context(self, url: str, use_case: str) -> CompanyContext: | |
| """ | |
| Build company context from URL and use case. | |
| Args: | |
| url: Company website URL | |
| use_case: The analytics use case (e.g., "Retail Analytics") | |
| Returns: | |
| CompanyContext with extracted information | |
| """ | |
| # Try to scrape the URL | |
| raw_content = self._scrape_url(url) | |
| # If scraping failed, try web search | |
| if not raw_content and self.web_search_fn: | |
| company_name = self._extract_company_from_url(url) | |
| raw_content = self.web_search_fn(f"{company_name} company about") | |
| if not raw_content: | |
| # Fallback: create minimal context from URL | |
| return self._create_minimal_context(url, use_case) | |
| # Use AI to extract structured context | |
| if self.anthropic_client: | |
| return self._extract_context_with_ai(raw_content, url, use_case) | |
| else: | |
| return self._create_minimal_context(url, use_case, raw_content) | |
| def _scrape_url(self, url: str) -> Optional[str]: | |
| """Scrape text content from URL.""" | |
| try: | |
| # Ensure URL has scheme | |
| if not url.startswith(('http://', 'https://')): | |
| url = 'https://' + url | |
| req = urllib.request.Request( | |
| url, | |
| headers={ | |
| 'User-Agent': 'Mozilla/5.0 (compatible; LegitData/1.0)' | |
| } | |
| ) | |
| with urllib.request.urlopen(req, timeout=10) as response: | |
| html = response.read().decode('utf-8', errors='ignore') | |
| # Extract text from HTML | |
| extractor = SimpleHTMLTextExtractor() | |
| extractor.feed(html) | |
| text = extractor.get_text() | |
| # Truncate to reasonable size | |
| return text[:10000] if text else None | |
| except (urllib.error.URLError, urllib.error.HTTPError, Exception) as e: | |
| print(f"Warning: Could not scrape {url}: {e}") | |
| return None | |
| def _extract_company_from_url(self, url: str) -> str: | |
| """Extract company name from URL.""" | |
| # Remove scheme and www | |
| cleaned = re.sub(r'^https?://(www\.)?', '', url) | |
| # Get domain | |
| domain = cleaned.split('/')[0] | |
| # Remove TLD | |
| company = domain.rsplit('.', 1)[0] | |
| # Clean up | |
| company = company.replace('-', ' ').replace('_', ' ') | |
| return company.title() | |
| def _extract_context_with_ai( | |
| self, | |
| raw_content: str, | |
| url: str, | |
| use_case: str | |
| ) -> CompanyContext: | |
| """Use AI to extract structured company context.""" | |
| prompt = f"""Analyze this website content and extract company information. | |
| Website URL: {url} | |
| Use Case: {use_case} | |
| Website Content: | |
| {raw_content[:8000]} | |
| Extract the following information and return as JSON: | |
| {{ | |
| "company_name": "Official company name", | |
| "industry": "Primary industry/vertical", | |
| "description": "Brief company description (1-2 sentences)", | |
| "products_services": ["List of main products or services"], | |
| "target_customers": "Who they sell to (B2B, B2C, enterprise, SMB, etc.)", | |
| "geographic_focus": "Where they operate (global, US, Europe, etc.)", | |
| "terminology": ["Industry-specific terms or jargon used"] | |
| }} | |
| Return ONLY valid JSON, no other text.""" | |
| try: | |
| response = self.anthropic_client.messages.create( | |
| model="claude-sonnet-4-20250514", | |
| max_tokens=1000, | |
| messages=[{"role": "user", "content": prompt}] | |
| ) | |
| # Parse JSON response | |
| json_str = response.content[0].text.strip() | |
| # Handle potential markdown code blocks | |
| if json_str.startswith('```'): | |
| json_str = re.sub(r'^```\w*\n?', '', json_str) | |
| json_str = re.sub(r'\n?```$', '', json_str) | |
| data = json.loads(json_str) | |
| return CompanyContext( | |
| company_name=data.get("company_name", self._extract_company_from_url(url)), | |
| industry=data.get("industry", "Unknown"), | |
| description=data.get("description", ""), | |
| products_services=data.get("products_services", []), | |
| target_customers=data.get("target_customers", "General"), | |
| geographic_focus=data.get("geographic_focus", "Global"), | |
| terminology=data.get("terminology", []), | |
| raw_content=raw_content[:2000] | |
| ) | |
| except Exception as e: | |
| print(f"Warning: AI extraction failed: {e}") | |
| return self._create_minimal_context(url, use_case, raw_content) | |
| def _create_minimal_context( | |
| self, | |
| url: str, | |
| use_case: str, | |
| raw_content: str = "" | |
| ) -> CompanyContext: | |
| """Create minimal context when extraction fails.""" | |
| company_name = self._extract_company_from_url(url) | |
| return CompanyContext( | |
| company_name=company_name, | |
| industry=self._infer_industry_from_use_case(use_case), | |
| description=f"{company_name} - {use_case}", | |
| products_services=[], | |
| target_customers="General", | |
| geographic_focus="Global", | |
| terminology=[], | |
| raw_content=raw_content[:2000] if raw_content else "" | |
| ) | |
| def _infer_industry_from_use_case(self, use_case: str) -> str: | |
| """Infer industry from use case.""" | |
| use_case_lower = use_case.lower() | |
| if 'retail' in use_case_lower: | |
| return "Retail" | |
| elif 'supply' in use_case_lower: | |
| return "Supply Chain / Logistics" | |
| elif 'financial' in use_case_lower: | |
| return "Financial Services" | |
| elif 'marketing' in use_case_lower: | |
| return "Marketing / Advertising" | |
| elif 'sales' in use_case_lower: | |
| return "Sales / Commerce" | |
| elif 'customer' in use_case_lower: | |
| return "Customer Service" | |
| else: | |
| return "General Business" | |