|
|
""" |
|
|
LLM Service for Grounded Summarization |
|
|
Provides fact-based summarization with strict grounding to prevent hallucination |
|
|
""" |
|
|
import asyncio
import logging
import os
from typing import Dict, List, Optional
|
|
|
|
|
logger = logging.getLogger(__name__)


class LLMService:
    """
    LLM Service with grounding support.

    Provides two modes:
    1. API Mode: Uses Anthropic Claude API if ANTHROPIC_API_KEY is available
    2. Fact-Based Mode: Uses structured fact extraction (no hallucination)

    Attributes:
        api_key: Value of the ANTHROPIC_API_KEY environment variable, if any.
        use_api: True when an API key is set and the ``anthropic`` package
            could be imported.
        client: ``anthropic.Anthropic`` client in API mode, otherwise ``None``.
        model: Model identifier sent with every API request.
        max_tokens: ``max_tokens`` value sent with every API request.
    """

    DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
    DEFAULT_MAX_TOKENS = 1000
    # Upper bound on the number of raw facts copied into the API prompt.
    MAX_PROMPT_FACTS = 50

    # Class-level fallbacks so these attributes always exist, even in
    # fact-based mode or on instances created without running __init__.
    client = None
    model = DEFAULT_MODEL
    max_tokens = DEFAULT_MAX_TOKENS

    def __init__(self, model: Optional[str] = None, max_tokens: Optional[int] = None):
        """
        Pick the operating mode from the environment.

        Args:
            model: Optional model identifier; defaults to ``DEFAULT_MODEL``.
            max_tokens: Optional response token limit; defaults to
                ``DEFAULT_MAX_TOKENS``.
        """
        self.api_key = os.getenv("ANTHROPIC_API_KEY")
        self.use_api = bool(self.api_key)
        self.client = None
        if model:
            self.model = model
        if max_tokens:
            self.max_tokens = max_tokens

        if self.use_api:
            try:
                # Imported lazily so the package is only required in API mode.
                import anthropic
            except ImportError:
                logger.warning("anthropic package not installed, falling back to fact-based mode")
                self.use_api = False
            else:
                self.client = anthropic.Anthropic(api_key=self.api_key)
                logger.info("LLM Service initialized with Anthropic Claude API")
        else:
            logger.info("LLM Service initialized in fact-based mode (no API key)")

    async def generate_grounded_summary(
        self,
        company_name: str,
        extracted_data: Dict,
        raw_facts: List[str],
        summary_type: str = "client"
    ) -> str:
        """
        Generate a summary strictly grounded in extracted facts.

        Args:
            company_name: Name of the company
            extracted_data: Structured data extracted from research
            raw_facts: List of raw text facts for grounding
            summary_type: "client" or "prospect" (passed through; it does not
                currently change the generated text)

        Returns:
            Grounded summary string
        """
        if self.use_api and self.client is not None:
            return await self._api_based_summary(company_name, extracted_data, raw_facts, summary_type)
        return self._fact_based_summary(company_name, extracted_data, summary_type, raw_facts)

    async def _api_based_summary(
        self,
        company_name: str,
        extracted_data: Dict,
        raw_facts: List[str],
        summary_type: str
    ) -> str:
        """
        Use Claude API to generate summary with strict grounding.

        Any failure of the API call (or an unexpected response shape) is
        logged and answered with the fact-based summary instead of raising.
        """
        prompt = self._build_prompt(company_name, extracted_data, raw_facts)

        try:
            # The SDK client is synchronous; run it in the default executor so
            # the HTTP round-trip does not block the event loop.
            loop = asyncio.get_running_loop()
            message = await loop.run_in_executor(
                None,
                lambda: self.client.messages.create(
                    model=self.model,
                    max_tokens=self.max_tokens,
                    temperature=0,
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                ),
            )

            summary = message.content[0].text
            logger.info("Generated API-based summary for %s (%d chars)", company_name, len(summary))
            return summary

        except Exception as e:
            # Deliberate best-effort boundary: never fail the caller because
            # the remote service is unavailable.
            logger.error("API summarization failed: %s, falling back to fact-based", e)
            return self._fact_based_summary(company_name, extracted_data, summary_type, raw_facts)

    def _build_prompt(
        self,
        company_name: str,
        extracted_data: Dict,
        raw_facts: Optional[List[str]]
    ) -> str:
        """Assemble the grounding prompt from structured data and raw facts."""
        facts_context = "\n".join(
            f"- {fact}" for fact in (raw_facts or [])[:self.MAX_PROMPT_FACTS]
        )
        structured_data = self._format_structured_data(extracted_data)

        return (
            f"You are a business research analyst creating a factual summary of {company_name}.\n"
            "\n"
            "CRITICAL RULES:\n"
            "1. ONLY use information from the FACTS and STRUCTURED DATA provided below\n"
            "2. DO NOT make up or infer ANY information not explicitly stated\n"
            '3. If information is missing, state "Information not available"\n'
            "4. Use direct quotes and facts from the provided data\n"
            "5. Be comprehensive but strictly factual\n"
            "\n"
            "STRUCTURED DATA EXTRACTED:\n"
            f"{structured_data}\n"
            "\n"
            "RAW FACTS FROM RESEARCH:\n"
            f"{facts_context}\n"
            "\n"
            f"Create a comprehensive 3-4 paragraph summary of {company_name} that:\n"
            "1. Describes what they do and their main offerings\n"
            "2. Explains their value proposition and key benefits\n"
            "3. Identifies their target customers and market position\n"
            "4. Includes relevant facts (founded, size, funding, competitors) if available\n"
            "\n"
            "Summary must be factual, well-structured, and grounded ONLY in the provided data."
        )

    @staticmethod
    def _clean_text(value: object) -> str:
        """Return ``value`` as stripped text without a trailing period ('' if falsy)."""
        if not value:
            return ""
        return str(value).strip().rstrip(".").rstrip()

    @staticmethod
    def _take_items(value: object, limit: int, strip_period: bool = True) -> List[str]:
        """
        Return up to ``limit`` non-empty text items taken from ``value``.

        A bare string (or any non-collection value) counts as a single item so
        it is never sliced character by character; items are coerced to ``str``.
        """
        if not value:
            return []
        if not isinstance(value, (list, tuple, set, frozenset)):
            value = [value]

        items: List[str] = []
        for raw in value:
            if not raw:
                continue
            text = LLMService._clean_text(raw) if strip_period else str(raw).strip()
            if text:
                items.append(text)
            if len(items) == limit:
                break
        return items

    def _fact_based_summary(
        self,
        company_name: str,
        extracted_data: Dict,
        summary_type: str,
        raw_facts: Optional[List[str]] = None
    ) -> str:
        """
        Generate fact-based summary without LLM (no hallucination possible).

        Args:
            company_name: Name of the company
            extracted_data: Structured data extracted from research
            summary_type: Accepted for interface symmetry; not used here
            raw_facts: Optional facts list, used for the fact count in the
                closing note only when ``extracted_data`` has no 'raw_facts'

        Returns:
            Sentences built from the populated fields, followed by a note
            stating how many facts the summary is based on.
        """
        clean = self._clean_text
        summary_parts = []

        # Overview sentence: description wins over industry.
        overview = f"**{company_name}**"
        description = clean(extracted_data.get('description'))
        industry = clean(extracted_data.get('industry'))
        if description:
            overview += f" - {description}"
        elif industry:
            overview += f" is a company in the {industry} industry"
        if extracted_data.get('website'):
            overview += f" (Website: {extracted_data['website']})"
        summary_parts.append(overview + ".")

        # Background sentence. The "was founded in ..." wording only works
        # when a founding date exists; otherwise switch to "has ...".
        founded = clean(extracted_data.get('founded'))
        company_size = clean(extracted_data.get('company_size'))
        funding = clean(extracted_data.get('funding'))
        if founded:
            background_facts = [f"founded in {founded}"]
            if company_size:
                background_facts.append(f"with {company_size}")
            if funding:
                background_facts.append(f"having raised {funding}")
            summary_parts.append("The company was " + ", ".join(background_facts) + ".")
        else:
            background_facts = []
            if company_size:
                background_facts.append(f"has {company_size}")
            if funding:
                background_facts.append(f"has raised {funding}")
            if background_facts:
                summary_parts.append("The company " + " and ".join(background_facts) + ".")

        # (key, max items, separator, sentence template), in output order.
        sections = (
            ('offerings', 3, "; ", "They offer: {}."),
            ('key_features', 4, "; ", "Key features include: {}."),
            ('value_propositions', 3, "; ", "Their value propositions are: {}."),
            ('target_customers', 2, "; ", "They serve: {}."),
            ('use_cases', 2, "; ", "Common use cases: {}."),
            ('pricing_model', 1, "; ", "Pricing: {}."),
            ('competitors', 3, ", ", "Main competitors include: {}."),
            ('differentiators', 2, "; ", "What sets them apart: {}."),
        )
        for key, limit, separator, template in sections:
            items = self._take_items(extracted_data.get(key), limit)
            if items:
                summary_parts.append(template.format(separator.join(items)))

        full_summary = " ".join(summary_parts)

        # Prefer the facts stored with the extracted data; fall back to the
        # separately supplied list so the note does not claim "0 facts".
        facts_count = len(extracted_data.get('raw_facts') or raw_facts or [])
        full_summary += f"\n\n*Note: This summary is based on {facts_count} facts extracted from web research. All information is grounded in actual data with no inferences or hallucinations.*"

        logger.info(
            "Generated fact-based summary for %s (%d chars, %d facts)",
            company_name, len(full_summary), facts_count
        )

        return full_summary

    def _format_structured_data(self, data: Dict) -> str:
        """
        Format extracted data for API prompt - ENHANCED with new fields.

        Returns one "Label: value" line per populated field, newline-joined;
        empty or missing fields are skipped.
        """
        lines: List[str] = []

        def add_scalar(label: str, key: str, max_chars: Optional[int] = None) -> None:
            # max_chars=None keeps the whole value (slice to None is a no-op).
            value = data.get(key)
            if value:
                lines.append(f"{label}: {str(value)[:max_chars]}")

        def add_list(label: str, key: str, limit: int) -> None:
            items = self._take_items(data.get(key), limit, strip_period=False)
            if items:
                lines.append(f"{label}: {', '.join(items)}")

        add_scalar("Name", 'name')
        add_scalar("Website", 'website')
        add_scalar("Industry", 'industry')

        add_scalar("Founded", 'founded')
        add_scalar("Company Size", 'company_size')
        add_scalar("Funding", 'funding')
        add_scalar("Market Position", 'market_position', 150)

        add_list("Offerings", 'offerings', 5)
        add_list("Key Features", 'key_features', 6)
        add_list("Integrations", 'integrations', 5)
        add_scalar("Pricing", 'pricing_model', 150)

        add_list("Value Propositions", 'value_propositions', 3)
        add_list("Target Customers", 'target_customers', 3)
        add_list("Use Cases", 'use_cases', 3)

        add_list("Competitors", 'competitors', 5)
        add_list("Awards & Recognition", 'awards', 3)

        # Only the number of testimonials is sent, not their text.
        # NOTE(review): assumes a sized collection of testimonials - confirm.
        if data.get('customer_testimonials'):
            lines.append(f"Customer Success Stories: {len(data['customer_testimonials'])} testimonials")
        add_list("Recent News", 'recent_news', 3)

        return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
_llm_service: Optional[LLMService] = None


def get_llm_service() -> LLMService:
    """Return the shared LLMService, building it on the first call.

    The instance is cached in the module-level ``_llm_service`` variable, so
    later calls hand back the same object.
    """
    global _llm_service
    service = _llm_service
    if service is not None:
        return service
    service = LLMService()
    _llm_service = service
    return service
|
|
|