|
|
import json
import os
import re

import requests

import FindEmailWorkFlowV2
import SendEmailWorkFlowV2
import setup
from llm_client import get_client
from pdf_utils import extract_text_from_pdf
|
|
|
|
|
|
|
|
# Hunter.io API key, read once at import time from the project's setup module.
HUNTER_API_KEY = setup.HUNTER_API_KEY
|
|
|
|
|
|
|
|
def load_legacy_excel_emails(excel_path="Workflow Company Log.xlsx"):
    """
    Load email addresses from the legacy Excel file as an initial database.

    This is a READ-ONLY operation - the Excel file is never modified.
    Only the first column of the sheet is inspected, and the sheet is read
    without a header row, so a header cell is simply skipped because it does
    not look like an address.

    Args:
        excel_path: Path to the Excel file containing previously sent emails
                    (default: "Workflow Company Log.xlsx")

    Returns:
        tuple: (set of email addresses, set of domains). Both sets are empty
        when the file is missing or cannot be read; read errors are reported
        on stdout and never raised.
    """
    emails = set()
    domains = set()

    if not os.path.exists(excel_path):
        print(f"Legacy Excel file not found at '{excel_path}'. Skipping Excel import.")
        return emails, domains

    try:
        # Imported lazily so that a missing pandas install only disables the
        # legacy import (the ImportError is handled below) instead of breaking
        # the whole module.
        import pandas as pd

        df = pd.read_excel(excel_path, header=None)

        for value in df.iloc[:, 0].dropna():
            email = str(value).strip()

            # Split on the LAST '@' so the domain is everything after it, and
            # require a non-empty local part plus a dotted domain. The previous
            # check ("'@' in email and '.' in email") accepted values such as
            # "first.last@" and stored an empty domain.
            local_part, separator, domain = email.rpartition('@')
            if not separator or not local_part:
                continue
            if '.' not in domain or domain.startswith('.') or domain.endswith('.'):
                continue

            emails.add(email)
            domains.add(domain)

        print(f"Loaded {len(emails)} emails from legacy Excel file ({len(domains)} unique domains)")

    except Exception as e:
        # Best effort by design: the legacy file is optional, so any failure
        # (bad file, empty sheet, missing Excel engine) is logged and ignored.
        print(f"Error reading Excel file '{excel_path}': {e}")
        print("Continuing without legacy Excel data...")

    return emails, domains
|
|
|
|
|
def askClaudeToFindCompanies(api_key=None, location="Atlanta", industry="Clean Tech", num_companies=5):
    """
    Ask the LLM client returned by get_client() for startup companies that
    match a location and an industry.

    Returns only company names and domains (no emails or contacts).

    Args:
        api_key: Deprecated, kept for backwards compatibility (ignored)
        location: City or region to search for companies (default: "Atlanta")
        industry: Industry type to target (default: "Clean Tech")
        num_companies: Number of companies to find (default: 5)

    Returns:
        list: List of dicts with keys: company_name, domain

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON.
        ValueError: If the JSON is not a list of dicts that each carry
            'company_name' and 'domain'.
    """
    client = get_client()

    # Pick a short list of example sub-fields for the prompt. "ai" / "ml" are
    # matched as whole tokens: a plain substring test also fires for
    # industries such as "Retail", "Supply Chain" or "HTML tooling".
    industry_lower = industry.lower()
    industry_tokens = set(re.findall(r"[a-z0-9]+", industry_lower))
    mentions_ai = (
        bool(industry_tokens & {"ai", "ml", "genai"})
        or "artificial intelligence" in industry_lower
        or "machine learning" in industry_lower
    )

    if "clean tech" in industry_lower or "green" in industry_lower:
        industry_examples = "(renewable energy, carbon capture, waste reduction, sustainable materials, etc.)"
    elif mentions_ai:
        industry_examples = "(machine learning, artificial intelligence, natural language processing, computer vision, etc.)"
    elif "fintech" in industry_lower:
        industry_examples = "(payments, banking, investment platforms, cryptocurrency, financial software, etc.)"
    elif "health" in industry_lower:  # also covers "healthcare"
        industry_examples = "(medical devices, health tech, biotech, telemedicine, health software, etc.)"
    elif "saas" in industry_lower:
        industry_examples = "(B2B software, enterprise tools, cloud platforms, productivity software, etc.)"
    else:
        industry_examples = f"({industry} related technologies and services)"

    prompt = (
        "Return only a valid JSON array of objects with exactly two fields: "
        "company_name, domain.\n\n"
        f"Find {num_companies} real, actively operating {industry} companies based in the {location} area. "
        "These should be companies you have high confidence actually exist.\n\n"

        "DOMAIN REQUIREMENTS:\n"
        "- Provide the company's primary website domain (e.g., 'acmesolar.com', NOT 'www.acmesolar.com' or 'https://acmesolar.com')\n"
        "- The domain should be the company's actual corporate domain\n"
        "- Do NOT include protocol (http/https) or subdomains (www)\n"
        "- ONLY include domains you are highly confident are correct\n"
        "- If you cannot find the correct domain for a company, SKIP IT entirely\n\n"

        "COMPANY REQUIREMENTS:\n"
        f"- Only include companies working in {industry} {industry_examples}\n"
        f"- Companies must be based in or have significant presence in {location}\n"
        "- **CRITICAL: ONLY include startups and early-stage companies (NOT established enterprises)**\n"
        "- Startups typically have more open internship opportunities and are more responsive\n"
        "- Focus on companies with 10-200 employees (smaller is better)\n"
        "- Prefer recently founded companies (last 10 years) that are actively growing\n"
        "- Only include companies you have high confidence are real and currently operating\n\n"

        "QUALITY OVER QUANTITY:\n"
        f"- It is better to return fewer than {num_companies} companies with REAL domains\n"
        f"- than to return {num_companies} companies with guessed or uncertain domains\n"
        "- Each entry should represent a real company you can verify exists\n\n"

        "OUTPUT FORMAT:\n"
        "- Output only valid JSON with no markdown, explanations, or commentary\n"
        "- Example: [{\"company_name\": \"Acme Solar\", \"domain\": \"acmesolar.com\"}]\n"
        "- Example: [{\"company_name\": \"Green Energy Solutions\", \"domain\": \"greenenergysolutions.com\"}]"
    )

    response_text = client.create_message(prompt, max_tokens=4096)

    # Models sometimes wrap the JSON in a Markdown code fence despite the
    # instructions; strip an opening ```json / ``` and a closing ``` if present.
    cleaned_text = response_text.strip()
    if cleaned_text.startswith("```json"):
        cleaned_text = cleaned_text[7:]
    elif cleaned_text.startswith("```"):
        cleaned_text = cleaned_text[3:]
    if cleaned_text.endswith("```"):
        cleaned_text = cleaned_text[:-3]
    cleaned_text = cleaned_text.strip()

    try:
        companies = json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        # Log the offending payload for debugging, then let the caller decide.
        print(f"Failed to parse JSON. Error: {e}")
        print(f"Cleaned text that failed to parse:\n{cleaned_text}")
        raise

    if not isinstance(companies, list):
        raise ValueError(f"Expected list of companies, but got {type(companies).__name__}")

    for i, item in enumerate(companies):
        if not isinstance(item, dict):
            raise ValueError(f"Expected dict at index {i}, but got {type(item).__name__}: {item}")

        if 'company_name' not in item:
            raise ValueError(f"Company at index {i} missing 'company_name' field: {item}")
        if 'domain' not in item:
            raise ValueError(f"Company at index {i} missing 'domain' field: {item}")

    return companies
|
|
|
|
|
|
|
|
# Mailbox prefixes that indicate an inbox meant for applications.
_RESUME_MAILBOX_PREFIXES = ('resume', 'resumes', 'careers', 'jobs', 'hiring', 'intern', 'internships', 'talent')

# Hiring-related keywords looked for in a contact's position and mailbox name.
# Each pattern only matches at the start of a word so that, for example, "hr"
# does not match "chris".
_HIRING_ROLE_PATTERNS = tuple(
    re.compile(r"(?<![a-z])" + re.escape(keyword))
    for keyword in ('hr', 'recruiting', 'talent', 'careers', 'people')
)

# Upper bound (seconds) for a single Hunter.io request so one stalled
# connection cannot hang the whole workflow.
_HUNTER_TIMEOUT_SECONDS = 15


def _hunter_email_score(email_info):
    """Rank one Hunter.io email entry; a higher score is a better recruiting contact."""
    # Hunter may send explicit nulls for these fields, so `.get(key, '')`
    # is not enough: fall back with `or`.
    position = (email_info.get('position') or '').lower()
    email = (email_info.get('value') or '').lower()
    local_part = email.split('@')[0]

    score = 0

    # Dedicated application inboxes (jobs@, careers@, ...) are the best target.
    if local_part.startswith(_RESUME_MAILBOX_PREFIXES):
        score += 20

    # Every hiring keyword found in the position or the mailbox name adds up.
    # The domain is deliberately not searched.
    for pattern in _HIRING_ROLE_PATTERNS:
        if pattern.search(position) or pattern.search(local_part):
            score += 10

    # A named person is preferred over an anonymous mailbox of equal rank.
    if email_info.get('first_name') and email_info.get('last_name'):
        score += 5

    if (email_info.get('verification') or {}).get('status') == 'valid':
        score += 3

    return score


def enrichCompaniesWithHunter(companies):
    """
    Uses the Hunter.io domain-search endpoint to find an email address and
    contact name for each company. Returns ONLY 1 contact per company.

    Failures for one company (network error, timeout, unexpected payload) are
    printed and that company is skipped; they never abort the whole run.

    Args:
        companies: List of dicts with keys: company_name, domain

    Returns:
        list: List of dicts with keys: company_name, contact_name, email_address
              Only includes companies where a usable email address was found.
              contact_name is None when Hunter has no first name for the entry.
    """
    contacts = []
    url = "https://api.hunter.io/v2/domain-search"

    for company in companies:
        company_name = company['company_name']
        domain = company['domain']

        print(f"Searching for contacts at {company_name} ({domain})...")

        try:
            params = {
                'domain': domain,
                'api_key': HUNTER_API_KEY,
                'limit': 10,
            }

            response = requests.get(url, params=params, timeout=_HUNTER_TIMEOUT_SECONDS)
            response.raise_for_status()

            data = response.json()

            if not isinstance(data, dict) or 'data' not in data:
                print(f"  No data returned for {company_name}")
                continue

            emails_data = (data['data'] or {}).get('emails') or []

            # Drop malformed entries and entries without an address so we never
            # emit a contact whose email_address is None.
            candidates = [
                entry for entry in emails_data
                if isinstance(entry, dict) and entry.get('value')
            ]

            if not candidates:
                print(f"  No emails found for {company_name}")
                continue

            # max() returns the first entry with the top score, which is the
            # same pick as a stable descending sort followed by [:1].
            best = max(candidates, key=_hunter_email_score)

            email_address = best.get('value')
            first_name = best.get('first_name')
            last_name = best.get('last_name')

            if first_name and last_name:
                contact_name = f"{first_name} {last_name}"
            elif first_name:
                contact_name = first_name
            else:
                contact_name = None

            contacts.append({
                'company_name': company_name,
                'contact_name': contact_name,
                'email_address': email_address,
            })
            print(f"  Found: {contact_name or 'Generic email'} - {email_address}")

        except requests.exceptions.RequestException as e:
            print(f"  Error fetching data for {company_name}: {e}")
            continue
        except Exception as e:
            # Deliberate best effort: one odd payload must not stop the batch.
            print(f"  Unexpected error for {company_name}: {e}")
            continue

    print(f"\nTotal contacts found: {len(contacts)}")
    return contacts
|
|
|
|
|
|
|
|
def createEmailsUsingClaude(contacts, resume_path, api_key=None, industry="Clean Tech", custom_message=""):
    """
    Ask the LLM client returned by get_client() to write a personalized
    outreach email for each contact.

    The model is only trusted for the email text. Each returned body is merged
    back onto the matching input contact (matched on email address,
    case-insensitively), so an address the model mistyped or invented is never
    passed on to the sender.

    Args:
        contacts: List of contact dicts (company_name, contact_name,
                  email_address), e.g. as produced by enrichCompaniesWithHunter
        resume_path: Path to PDF resume file
        api_key: Deprecated, kept for backwards compatibility (ignored)
        industry: Industry type to tailor email content (default: "Clean Tech")
        custom_message: Optional custom message to incorporate into emails (default: "")

    Returns:
        list: List of dicts with company_name, contact_name, email_address, email_body.
              Contacts for which the model produced no body are omitted (a
              warning is printed for each).

    Raises:
        ValueError: If no text can be extracted from the resume, or the model
            output is not a list of dicts carrying 'email_address' and 'email_body'.
        json.JSONDecodeError: If the model response is not valid JSON.
    """
    client = get_client()

    resume_text = extract_text_from_pdf(resume_path)
    if not resume_text:
        raise ValueError(f"Could not extract text from resume at {resume_path}")

    # Nothing to write: skip the (slow, billable) model call.
    if not contacts:
        return []

    contact_text = json.dumps(contacts, indent=2)

    custom_requirement = (
        f"8. Incorporates this specific message/requirement: {custom_message}"
        if custom_message else ""
    )

    prompt = (
        f"You are helping draft personalized internship outreach emails for companies in the {industry} industry. "
        "For each company listed below, create a tailored email that:\n\n"
        f"1. References specific work or projects the company is doing in {industry}\n"
        "2. Connects the applicant's background (found in the resume) to the company's mission\n"
        "3. Sounds authentic, human, and genuinely interested (NOT AI-generated)\n"
        "4. Is professional but warm and conversational\n"
        "5. Asks for internship opportunities without being pushy\n\n"
        "6. Keeps the email concise (150-200 words)\n\n"
        "7. Does not fabricate any information about the company or the applicant\n\n"
        f"{custom_requirement}\n\n"
        f"RESUME CONTENT:\n{resume_text}\n\n"
        "Example email structure (adapt this based on the resume and each company):\n\n"
        "Hi [Company Name Team],\n\n"
        "I hope you're well. My name is [Name from resume], and I'm a [major/background from resume] student at [university from resume]. "
        f"I recently came across [Company Name]'s work on [specific project/technology in {industry}] and was fascinated by [specific technical aspect]. "
        "I've spent time working on [relevant experience from resume], and I'd love to see how these skills might apply in a real-world, high-impact setting like yours. "
        "My interest is to learn from experienced teams and contribute in any way I can, however small. "
        "If there is a way for me to get involved with the technical side at [Company Name], I'd be grateful for the chance to discuss.\n\n"
        "I've attached my resume for reference. Thank you very much for considering this note, and I appreciate any time or advice you can offer.\n\n"
        "Best,\n[Name from resume]\n\n"
        "IMPORTANT:\n"
        f"- Research each company and reference their actual work in {industry}\n"
        "- Extract the applicant's name, university, and major from the resume\n"
        "- Match skills from the resume to each company's focus area\n"
        "- Make each email unique - no copy-paste language between companies\n"
        "- Keep emails concise (150-200 words)\n\n"
        f"Company contacts:\n{contact_text}\n\n"
        "Return a JSON array with the same contacts but add an 'email_body' field containing the tailored email body. "
        "Do not include subject line or attachment information. Return only valid JSON with no additional text."
    )

    response_text = client.create_message(prompt, max_tokens=8000)

    # Models sometimes wrap the JSON in a Markdown code fence despite the
    # instructions; strip an opening ```json / ``` and a closing ``` if present.
    cleaned_text = response_text.strip()
    if cleaned_text.startswith("```json"):
        cleaned_text = cleaned_text[7:]
    elif cleaned_text.startswith("```"):
        cleaned_text = cleaned_text[3:]
    if cleaned_text.endswith("```"):
        cleaned_text = cleaned_text[:-3]
    cleaned_text = cleaned_text.strip()

    try:
        generated = json.loads(cleaned_text)
    except json.JSONDecodeError as e:
        # Log the offending payload for debugging, then let the caller decide.
        print(f"Failed to parse JSON. Error: {e}")
        print(f"Cleaned text that failed to parse:\n{cleaned_text}")
        raise

    if not isinstance(generated, list):
        raise ValueError(f"Expected list of contacts, but got {type(generated).__name__}")

    # Index the generated bodies by normalized address (first occurrence wins).
    bodies_by_address = {}
    for i, item in enumerate(generated):
        if not isinstance(item, dict):
            raise ValueError(f"Expected dict at index {i}, but got {type(item).__name__}: {item}")

        if 'email_address' not in item:
            raise ValueError(f"Contact at index {i} missing 'email_address' field: {item}")
        if 'email_body' not in item:
            raise ValueError(f"Contact at index {i} missing 'email_body' field: {item}")

        address_key = str(item['email_address']).strip().lower()
        bodies_by_address.setdefault(address_key, item['email_body'])

    # Rebuild the result from the caller's own contacts so that names and
    # addresses come from the trusted input, not from the model.
    emails_with_bodies = []
    requested_keys = set()
    for contact in contacts:
        address_key = str(contact.get('email_address', '')).strip().lower()
        requested_keys.add(address_key)
        if address_key in bodies_by_address:
            emails_with_bodies.append({**contact, 'email_body': bodies_by_address[address_key]})
        else:
            print(f"Warning: no email body was generated for {contact.get('email_address')}; skipping this contact.")

    unexpected = sorted(set(bodies_by_address) - requested_keys)
    if unexpected:
        print(f"Warning: ignoring generated emails for unrequested addresses: {', '.join(unexpected)}")

    return emails_with_bodies
|
|
|
|
|
|
|
|
def main(
    sender_email,
    sender_password,
    user_id=None,
    user_emails_sent=None,
    user_domains_contacted=None,
    resume_path="Sumedh_Kothari_Resume.pdf",
    location="Atlanta",
    industry="Clean Tech",
    num_emails=5,
    custom_message="",
    progress_callback=None,
    max_attempts=10
):
    """
    Execute the complete workflow from contact discovery to email sending.

    Uses askClaudeToFindCompanies to find companies, then Hunter.io to find
    contacts. Loops until the desired number of unique emails is found or max
    attempts is reached, then drafts and sends the emails.

    Args:
        sender_email: User's email address for sending emails
        sender_password: User's email password or app-specific password
        user_id: User ID for tracking sent emails (optional; not used inside
                 this function)
        user_emails_sent: Iterable of emails already sent by this user (optional).
                          It is copied, never modified.
        user_domains_contacted: Iterable of domains already contacted by this
                                user (optional). It is copied, never modified.
        resume_path: Path to resume PDF file (default: "Sumedh_Kothari_Resume.pdf")
        location: City or region to search for companies (default: "Atlanta")
        industry: Industry type to target (default: "Clean Tech")
        num_emails: Number of unique emails to send (default: 5)
        custom_message: Optional custom message to include in emails (default: "")
        progress_callback: Optional callback function for progress updates (default: None)
                          Signature: callback(message, msg_type='in-progress', count=None)
        max_attempts: Maximum number of search attempts (default: 10)

    Returns:
        dict: Keys "successful", "failed", "total", "emails_sent" (addresses
        that were targeted) and "contacts_data" (the drafted emails). When no
        new contact could be found, the counts are 0 and both lists are empty.
    """
    # Work on private copies: this accepts any iterable (list, tuple, set) and
    # guarantees the caller's collections are left untouched.
    user_emails_sent = set(user_emails_sent) if user_emails_sent is not None else set()
    user_domains_contacted = set(user_domains_contacted) if user_domains_contacted is not None else set()

    # Captured before the merge so the log line below reports the true count
    # even when the Excel file and the database overlap.
    database_email_count = len(user_emails_sent)

    excel_emails, excel_domains = load_legacy_excel_emails()

    user_emails_sent |= excel_emails
    user_domains_contacted |= excel_domains

    if excel_emails:
        print(f"Merged {len(excel_emails)} legacy emails from Excel with {database_email_count} database emails")
        print(f"Total emails to avoid: {len(user_emails_sent)}")
        print(f"Total domains to avoid: {len(user_domains_contacted)}")

    def progress(msg, msg_type='in-progress', count=None):
        """Print a status line and forward it to the optional callback."""
        print(msg)
        if progress_callback:
            progress_callback(msg, msg_type, count)

    all_unique_contacts = []
    session_emails = set()   # addresses collected during this run
    session_domains = set()  # domains collected during this run
    attempt = 0
    batch_size = max(5, num_emails)  # always ask for at least 5 companies per attempt

    progress(f"Starting search for {num_emails} unique {industry} contacts in {location}...", 'in-progress')

    # NOTE(review): every attempt sends the same request to the model; nothing
    # tells it which companies were already returned, so later attempts may
    # repeat earlier ones.
    while len(all_unique_contacts) < num_emails and attempt < max_attempts:
        attempt += 1
        progress(f"Search attempt {attempt}/{max_attempts} (found {len(all_unique_contacts)}/{num_emails} unique contacts so far)...", 'in-progress')

        progress(f"Searching for {batch_size} {industry} companies...", 'in-progress')
        companies = askClaudeToFindCompanies(location=location, industry=industry, num_companies=batch_size)
        progress(f"Found {len(companies)} companies", 'success')

        if not companies:
            progress("No companies found in this batch", 'error')
            break

        progress(f"Finding email contacts for {len(companies)} companies...", 'in-progress')
        contacts = enrichCompaniesWithHunter(companies)
        progress(f"Found {len(contacts)} email contacts", 'success')

        if not contacts:
            progress("No email contacts found in this batch", 'error')
            continue

        progress("Removing duplicates and previously contacted companies...", 'in-progress')

        # Exclude both historical contacts and the ones picked earlier in this run.
        combined_emails = user_emails_sent | session_emails
        combined_domains = user_domains_contacted | session_domains

        cleaned_contacts = FindEmailWorkFlowV2.main(contacts, combined_emails, combined_domains)
        progress(f"After deduplication: {len(cleaned_contacts)} new contacts in this batch", 'success')

        if not cleaned_contacts:
            progress("No new unique contacts in this batch (all were duplicates)", 'in-progress')
            continue

        for contact in cleaned_contacts:
            if len(all_unique_contacts) >= num_emails:
                break

            all_unique_contacts.append(contact)
            address = contact['email_address']
            session_emails.add(address)

            # Everything after the last '@' is the domain; skip it when empty.
            _, separator, domain = address.rpartition('@')
            if separator and domain:
                session_domains.add(domain)

        progress(f"Total unique contacts collected: {len(all_unique_contacts)}/{num_emails}", 'in-progress')

        if len(all_unique_contacts) >= num_emails:
            progress(f"Successfully found {len(all_unique_contacts)} unique contacts!", 'success')
            break

    if not all_unique_contacts:
        progress("Unable to find any new unique contacts. All available contacts have already been contacted or no contacts were found.", 'error')
        # Same keys as the normal return value so callers can read either result.
        return {"successful": 0, "failed": 0, "total": 0, "emails_sent": [], "contacts_data": []}

    if len(all_unique_contacts) < num_emails:
        progress(f"Could only find {len(all_unique_contacts)} unique contacts after {attempt} attempts. All other available contacts in {location} {industry} have already been contacted or could not be found.", 'error')

    final_contacts = all_unique_contacts[:num_emails]

    progress(f"Generating {len(final_contacts)} personalized emails using AI...", 'in-progress')
    emails_with_bodies = createEmailsUsingClaude(final_contacts, resume_path, industry=industry, custom_message=custom_message)
    progress(f"Created {len(emails_with_bodies)} personalized emails", 'success')

    progress("Sending emails (this may take a few minutes)...", 'in-progress')
    results = SendEmailWorkFlowV2.main(
        emails_with_bodies,
        resume_path,
        sender_email,
        sender_password
    )

    # NOTE(review): this lists every targeted address, whether or not that
    # particular message was delivered; only aggregate counts are read from
    # the sender's result here.
    emails_sent = [contact['email_address'] for contact in final_contacts]

    final_results = {
        "successful": results.get("success", 0),
        "failed": results.get("failure", 0),
        "total": results.get("total", 0),
        "emails_sent": emails_sent,
        "contacts_data": emails_with_bodies
    }

    progress(f"Emails sent successfully: {final_results['successful']}/{final_results['total']}", 'success', final_results['successful'])
    return final_results
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Running this file directly only prints usage guidance; it does not
    # start the workflow.
    print("This script should be run through the web application.")
    print("To test standalone, call main() with required parameters.")
|
|
|