curiousgeorge1292 committed on
Commit
7f372a3
·
verified ·
1 Parent(s): 43b7cfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -4
app.py CHANGED
@@ -2,6 +2,9 @@ import os
2
  import requests
3
  import logging
4
  from bs4 import BeautifulSoup
 
 
 
5
  import gradio as gr
6
  from groq import Groq
7
  import json
@@ -241,10 +244,13 @@ def user_profile(email, name, professional_title, industry, target_audience, per
241
  save_user_info(email, name, professional_title, industry, target_audience, personal_background, company_url)
242
  return "Your information has been saved! Proceed to Step 2 for email generation."
243
 
244
- # Function to extract content from a URL
245
  def extract_content(url):
 
 
 
246
  try:
247
- response = requests.get(url, timeout=10)
248
  response.raise_for_status()
249
  soup = BeautifulSoup(response.text, 'html.parser')
250
  paragraphs = soup.find_all('p')
@@ -253,6 +259,31 @@ def extract_content(url):
253
  except Exception as e:
254
  return f"Error extracting content from {url}: {str(e)}"
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  # Function to fetch LinkedIn profile insights using Proxycurl API
257
  def fetch_linkedin_insights(profile_url):
258
  api_key = os.environ.get("PROXYCURL_API_KEY")
@@ -273,11 +304,29 @@ def fetch_linkedin_insights(profile_url):
273
  def generate_email(name, email, prospect_name, linkedin_url, website_url, context_url, word_count, email_purpose, interested_position, company_url, professional_title, personal_background):
274
  # Fetch insights from LinkedIn and reference URLs
275
  linkedin_insights = fetch_linkedin_insights(linkedin_url)
276
- website_content = extract_content(website_url)
 
 
 
 
 
 
 
 
 
277
  context_content = extract_content(context_url) if context_url else ""
278
 
279
  # Fetch details from the company website
280
- company_content = extract_content(company_url)
 
 
 
 
 
 
 
 
 
281
 
282
  # Construct the purpose-specific prompt
283
  if email_purpose == "Job Application":
 
2
  import requests
3
  import logging
4
  from bs4 import BeautifulSoup
5
+ import time
6
+ import xml.etree.ElementTree as ET
7
+ from urllib.parse import urljoin
8
  import gradio as gr
9
  from groq import Groq
10
  import json
 
244
  save_user_info(email, name, professional_title, industry, target_audience, personal_background, company_url)
245
  return "Your information has been saved! Proceed to Step 2 for email generation."
246
 
247
+ # Helper function to extract content from a URL
248
  def extract_content(url):
249
+ headers = {
250
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
251
+ }
252
  try:
253
+ response = requests.get(url, headers=headers, timeout=10)
254
  response.raise_for_status()
255
  soup = BeautifulSoup(response.text, 'html.parser')
256
  paragraphs = soup.find_all('p')
 
259
  except Exception as e:
260
  return f"Error extracting content from {url}: {str(e)}"
261
 
262
# Helper function to parse a sitemap and get valid URLs
def parse_sitemap(sitemap_url):
    """Fetch a sitemap (or sitemap index) and return the <loc> URLs it lists.

    Returns:
        list[str] of URLs on success, or an error-message string
        ("Error parsing sitemap from ...") on any failure — callers
        distinguish the two with isinstance(result, list).
    """
    # Browser-like UA: some servers refuse the default python-requests agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(sitemap_url, headers=headers, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        # Both <urlset> and <sitemapindex> documents carry their URLs in
        # namespaced <loc> elements, so one findall covers both.
        urls = []
        for loc in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
            # Fix: the original appended loc.text verbatim, so an empty
            # <loc/> produced a None entry; the sitemap protocol also
            # permits surrounding whitespace inside <loc> — strip it.
            if loc.text and loc.text.strip():
                urls.append(loc.text.strip())
        return urls
    except Exception as e:
        return f"Error parsing sitemap from {sitemap_url}: {str(e)}"
277
+
278
# Wrapper to handle retries and delay
def safe_extract_content(url, delay=2):
    """Extract page text via extract_content, returning None on failure.

    Args:
        url: page to fetch.
        delay: seconds to sleep after a successful fetch so back-to-back
            calls respect a polite crawl delay.

    Returns:
        The extracted text, or None when extract_content reported an error.
    """
    content = extract_content(url)
    # extract_content signals failure by returning a message of the form
    # "Error extracting content from <url>: ...". Match on that prefix:
    # the original substring test ("Error extracting content" in content)
    # could misclassify a page whose body merely contains the phrase.
    if content.startswith("Error extracting content from"):
        logging.warning(content)  # route through logging instead of print
        return None
    time.sleep(delay)  # Respect crawl-delay
    return content
286
+
287
  # Function to fetch LinkedIn profile insights using Proxycurl API
288
  def fetch_linkedin_insights(profile_url):
289
  api_key = os.environ.get("PROXYCURL_API_KEY")
 
304
  def generate_email(name, email, prospect_name, linkedin_url, website_url, context_url, word_count, email_purpose, interested_position, company_url, professional_title, personal_background):
305
  # Fetch insights from LinkedIn and reference URLs
306
  linkedin_insights = fetch_linkedin_insights(linkedin_url)
307
+ website_sitemap_url = urljoin(website_url, "sitemap_index.xml")
308
+ website_content = safe_extract_content(website_url)
309
+ if not website_content:
310
+ # If direct scraping fails, fall back to the sitemap
311
+ website_urls = parse_sitemap(website_sitemap_url)
312
+ if isinstance(urls, list):
313
+ for url in urls:
314
+ website_content = safe_extract_content(url)
315
+ if website_content:
316
+ break
317
  context_content = extract_content(context_url) if context_url else ""
318
 
319
  # Fetch details from the company website
320
+ company_sitemap_url = urljoin(company_url, "sitemap_index.xml")
321
+ company_content = safe_extract_content(company_url)
322
+ if not company_content:
323
+ # If direct scraping fails, fall back to the sitemap
324
+ company_urls = parse_sitemap(company_sitemap_url)
325
+ if isinstance(urls, list):
326
+ for url in urls:
327
+ company_content = safe_extract_content(url)
328
+ if company_content:
329
+ break
330
 
331
  # Construct the purpose-specific prompt
332
  if email_purpose == "Job Application":