Spaces:

Agents-MCP-Hackathon
/

Founder_Name_Extraction_v2

Sleeping

App Files Files Community

Founder_Name_Extraction_v2 / app.py

dygoo

Update app.py

7c6b357 verified 9 months ago

raw

history blame contribute delete

6.9 kB

	import gradio as gr
	import requests
	import re
	from duckduckgo_search import DDGS
	import anthropic
	import os
	import json

	# Shared Anthropic API client; the key is read from the ANTHROPIC_API_KEY
	# environment variable (None is passed through when it is unset).
	anthropic_client = anthropic.Anthropic(
	    api_key=os.environ.get("ANTHROPIC_API_KEY"),
	)

	# === 1. Simplified Search Workflow ===

	def _format_article(label: str, result: dict) -> tuple:
	    """Render one search hit as a ``(display_markdown, ai_text)`` pair.

	    ``label`` is the category shown to the user and the model ("Recent" or
	    "Historical"). Missing fields fall back to placeholder text.
	    """
	    title = result.get('title', 'No Title')
	    url = result.get('href', '#')
	    body = result.get('body', 'No snippet available.')

	    markdown = f"### ({label}) {title}\nSource: [{url}]({url})\n\n{body}\n"
	    ai_text = f"Article ({label}):\nTitle: {title}\nContent: {body}\n\n"
	    return markdown, ai_text


	def search_workflow(name: str, progress=gr.Progress()):
	    """
	    Search DuckDuckGo for founder-related articles about a company.

	    Two text searches are run, one restricted to the past year ("Recent")
	    and one unrestricted ("Historical"), each asking for up to 4 results.
	    At most 8 articles are returned; fewer when the engine yields fewer.

	    Args:
	        name: Company name typed by the user.
	        progress: Gradio progress tracker updated as the search advances.

	    Returns:
	        A ``(markdown, raw_text)`` tuple. ``markdown`` is the report shown
	        in the UI (or an error/info message); ``raw_text`` is the plain-text
	        article dump handed to the extraction step, and is ``""`` whenever
	        the search failed or found nothing.
	    """
	    # Trim whitespace and drop double quotes: the name is embedded in an
	    # exact-phrase ("...") query, so stray quotes or padding would break it.
	    company = name.replace('"', '').strip() if name else ""
	    if not company:
	        return "❌ Please enter a company name.", ""

	    progress(0, desc="Starting search...")

	    # (label, query, extra DDGS.text kwargs, progress fraction, progress text)
	    searches = (
	        (
	            "Recent",
	            f'"{company}" founder news',
	            {"timelimit": "y"},  # limit to the past year
	            0.1,
	            "Searching for recent articles...",
	        ),
	        (
	            "Historical",
	            f'"{company}" founder history origin',
	            {},
	            0.5,
	            "Searching for historical articles...",
	        ),
	    )

	    markdown_parts = []
	    ai_parts = []

	    try:
	        with DDGS(timeout=20) as ddgs:
	            for label, keywords, extra, fraction, desc in searches:
	                progress(fraction, desc=desc)
	                results = ddgs.text(keywords=keywords, max_results=4, **extra) or []
	                for result in results:
	                    markdown, ai_text = _format_article(label, result)
	                    markdown_parts.append(markdown)
	                    ai_parts.append(ai_text)
	    except Exception as e:
	        # UI boundary: report the failure to the user instead of crashing.
	        return f"❌ An error occurred during search: {e}", ""

	    if not markdown_parts:
	        return "[INFO] No articles found for that company.", ""

	    progress(1.0, desc="Search complete!")

	    final_markdown = (
	        f"## Found {len(markdown_parts)} Articles\n\n"
	        + "\n---\n".join(markdown_parts)
	    )
	    return final_markdown, "".join(ai_parts)


	# === 2. Simplified Extraction Workflow ===

	def _format_founders_json(json_text: str) -> tuple:
	    """Pull the JSON object out of the model's reply and render it.

	    Returns ``(report, ok)``: ``report`` is the Markdown to display and
	    ``ok`` is True only when a valid JSON object was found and parsed.
	    """
	    # The model may wrap the JSON in prose or code fences; grab the span
	    # from the first "{" to the last "}".
	    match = re.search(r'\{.*\}', json_text, re.DOTALL)
	    if not match:
	        return f"⚠️ AI Warning: The model did not return a JSON object.\n\n{json_text}", False

	    clean_json = match.group(0)
	    try:
	        parsed_json = json.loads(clean_json)
	    except json.JSONDecodeError:
	        return f"⚠️ AI Warning: The model returned malformed JSON.\n\n{clean_json}", False

	    # ensure_ascii=False keeps non-ASCII names readable in the report
	    # instead of showing them as \uXXXX escapes.
	    formatted_json = json.dumps(parsed_json, indent=2, ensure_ascii=False)
	    return f"```json\n{formatted_json}\n```", True


	def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
	    """
	    Ask the AI model to extract founder names from the searched articles.

	    Args:
	        raw_text: Plain-text article dump produced by the search step.
	        company_name: Company the articles are about; quoted in the prompt.
	        progress: Gradio progress tracker updated as the request advances.

	    Returns:
	        A Markdown string: a fenced JSON block on success, otherwise a
	        warning or error message. Exceptions from the API call are caught
	        and reported in the returned string rather than raised.
	    """
	    if not raw_text or not raw_text.strip():
	        return "❌ Please run a search first to get text to analyze."

	    progress(0, desc="Preparing prompt for AI...")

	    # Trim padding so the quoted company name in the prompt is clean.
	    company_name = (company_name or "").strip()

	    # Only the first 20000 characters of the articles are sent to the model.
	    prompt = f"""From the provided article snippets about "{company_name}", extract the names of individuals explicitly identified as a founder.
	Return a single, valid JSON object with the structure: {{"founders": [{{"name": "Founder's Name", "evidence": "A brief quote or context."}}]}}
	If no founders are mentioned, return an empty list: {{"founders": []}}.
	Do not add any text outside the JSON object.

	ARTICLES:
	---
	{raw_text[:20000]}
	---
	"""
	    try:
	        progress(0.5, desc="Sending request to AI model...")
	        message = anthropic_client.messages.create(
	            model="claude-sonnet-4-20250514",
	            max_tokens=1024,
	            temperature=0.0,
	            messages=[{"role": "user", "content": prompt}],
	        )

	        # Use the first content block that carries text; an empty or
	        # text-less response is reported instead of raising IndexError.
	        blocks = getattr(message, "content", None) or []
	        json_text = next(
	            (
	                block.text
	                for block in blocks
	                if isinstance(getattr(block, "text", None), str)
	            ),
	            None,
	        )
	        if json_text is None:
	            return "❌ API Error: The AI model returned an empty or invalid response."

	        report, ok = _format_founders_json(json_text)
	        if ok:
	            progress(1.0, desc="Extraction complete!")
	        return report

	    except Exception as e:
	        # UI boundary: report the failure to the user instead of crashing.
	        return f"❌ An unexpected error occurred during extraction: {e}"


	# === 3. Simplified Gradio UI ===

	with gr.Blocks(
	    title="Founder Name Extraction Tool",
	    theme=gr.themes.Soft(),
	) as demo:
	    # Page header and short usage instructions.
	    gr.Markdown("# 🔎 Founder Name Extraction")
	    gr.Markdown("A tool to find company founders. Step 1: Search for articles. Step 2: Extract founders' names from the results.")

	    # Invisible per-session state carrying the article text from the
	    # search step into the extraction step.
	    search_results_for_ai = gr.State("")

	    # Row 1: company name entry next to the search trigger.
	    with gr.Row():
	        name_input = gr.Textbox(
	            label="Company Name",
	            placeholder="e.g., 'OpenAI', 'SpaceX'",
	            scale=3,
	        )
	        search_btn = gr.Button(
	            "1. 🔍 Search for Articles",
	            variant="primary",
	            scale=1,
	        )

	    # Row 2: extraction trigger.
	    with gr.Row():
	        extract_btn = gr.Button(
	            "2. 📊 Extract Founders from Search Results",
	            variant="secondary",
	        )

	    # One output tab per workflow step.
	    with gr.Tab("Search Results"):
	        output_search = gr.Markdown()
	    with gr.Tab("Founder Intelligence Report"):
	        output_extract = gr.Markdown()

	    # --- Event wiring ---

	    # Step 1: fill the results tab and stash the raw text in state.
	    search_btn.click(
	        search_workflow,
	        [name_input],
	        [output_search, search_results_for_ai],
	        show_progress="full",
	    )

	    # Step 2: read the stashed text and fill the report tab.
	    extract_btn.click(
	        extraction_workflow,
	        [search_results_for_ai, name_input],
	        [output_extract],
	        show_progress="full",
	    )


	# Enable request queuing for the app's event handlers.
	demo.queue()

	if __name__ == "__main__":
	    demo.launch(show_error=True)