curiousgeorge1292 committed on
Commit
7f372a3
·
verified ·
1 Parent(s): 43b7cfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -4
app.py CHANGED
@@ -2,6 +2,9 @@ import os
2
  import requests
3
  import logging
4
  from bs4 import BeautifulSoup
 
 
 
5
  import gradio as gr
6
  from groq import Groq
7
  import json
@@ -241,10 +244,13 @@ def user_profile(email, name, professional_title, industry, target_audience, per
241
  save_user_info(email, name, professional_title, industry, target_audience, personal_background, company_url)
242
  return "Your information has been saved! Proceed to Step 2 for email generation."
243
 
244
- # Function to extract content from a URL
245
  def extract_content(url):
 
 
 
246
  try:
247
- response = requests.get(url, timeout=10)
248
  response.raise_for_status()
249
  soup = BeautifulSoup(response.text, 'html.parser')
250
  paragraphs = soup.find_all('p')
@@ -253,6 +259,31 @@ def extract_content(url):
253
  except Exception as e:
254
  return f"Error extracting content from {url}: {str(e)}"
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  # Function to fetch LinkedIn profile insights using Proxycurl API
257
  def fetch_linkedin_insights(profile_url):
258
  api_key = os.environ.get("PROXYCURL_API_KEY")
@@ -273,11 +304,29 @@ def fetch_linkedin_insights(profile_url):
273
  def generate_email(name, email, prospect_name, linkedin_url, website_url, context_url, word_count, email_purpose, interested_position, company_url, professional_title, personal_background):
274
  # Fetch insights from LinkedIn and reference URLs
275
  linkedin_insights = fetch_linkedin_insights(linkedin_url)
276
- website_content = extract_content(website_url)
 
 
 
 
 
 
 
 
 
277
  context_content = extract_content(context_url) if context_url else ""
278
 
279
  # Fetch details from the company website
280
- company_content = extract_content(company_url)
 
 
 
 
 
 
 
 
 
281
 
282
  # Construct the purpose-specific prompt
283
  if email_purpose == "Job Application":
 
2
  import requests
3
  import logging
4
  from bs4 import BeautifulSoup
5
+ import time
6
+ import xml.etree.ElementTree as ET
7
+ from urllib.parse import urljoin
8
  import gradio as gr
9
  from groq import Groq
10
  import json
 
244
  save_user_info(email, name, professional_title, industry, target_audience, personal_background, company_url)
245
  return "Your information has been saved! Proceed to Step 2 for email generation."
246
 
247
+ # Helper function to extract content from a URL
248
  def extract_content(url):
249
+ headers = {
250
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
251
+ }
252
  try:
253
+ response = requests.get(url, headers=headers, timeout=10)
254
  response.raise_for_status()
255
  soup = BeautifulSoup(response.text, 'html.parser')
256
  paragraphs = soup.find_all('p')
 
259
  except Exception as e:
260
  return f"Error extracting content from {url}: {str(e)}"
261
 
262
# Helper function to parse a sitemap and get valid URLs
def parse_sitemap(sitemap_url):
    """Fetch a sitemap (or sitemap index) and return the <loc> URLs it lists.

    Returns:
        list[str] of URLs on success, or an error-message string
        ("Error parsing sitemap from ...") on any failure — callers
        distinguish the two with isinstance(result, list).
    """
    # Browser-like UA: some servers refuse the default python-requests agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(sitemap_url, headers=headers, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        # Both <urlset> and <sitemapindex> documents carry their URLs in
        # namespaced <loc> elements, so one findall covers both.
        urls = []
        for loc in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
            # Fix: the original appended loc.text verbatim, so an empty
            # <loc/> produced a None entry; the sitemap protocol also
            # permits surrounding whitespace inside <loc> — strip it.
            if loc.text and loc.text.strip():
                urls.append(loc.text.strip())
        return urls
    except Exception as e:
        return f"Error parsing sitemap from {sitemap_url}: {str(e)}"
277
+
278
# Wrapper to handle retries and delay
def safe_extract_content(url, delay=2):
    """Extract page text via extract_content, returning None on failure.

    Args:
        url: page to fetch.
        delay: seconds to sleep after a successful fetch so back-to-back
            calls respect a polite crawl delay.

    Returns:
        The extracted text, or None when extract_content reported an error.
    """
    content = extract_content(url)
    # extract_content signals failure by returning a message of the form
    # "Error extracting content from <url>: ...". Match on that prefix:
    # the original substring test ("Error extracting content" in content)
    # could misclassify a page whose body merely contains the phrase.
    if content.startswith("Error extracting content from"):
        logging.warning(content)  # route through logging instead of print
        return None
    time.sleep(delay)  # Respect crawl-delay
    return content
286
+
287
  # Function to fetch LinkedIn profile insights using Proxycurl API
288
  def fetch_linkedin_insights(profile_url):
289
  api_key = os.environ.get("PROXYCURL_API_KEY")
 
304
  def generate_email(name, email, prospect_name, linkedin_url, website_url, context_url, word_count, email_purpose, interested_position, company_url, professional_title, personal_background):
305
  # Fetch insights from LinkedIn and reference URLs
306
  linkedin_insights = fetch_linkedin_insights(linkedin_url)
307
+ website_sitemap_url = urljoin(website_url, "sitemap_index.xml")
308
+ website_content = safe_extract_content(website_url)
309
+ if not website_content:
310
+ # If direct scraping fails, fall back to the sitemap
311
+ website_urls = parse_sitemap(website_sitemap_url)
312
+ if isinstance(urls, list):
313
+ for url in urls:
314
+ website_content = safe_extract_content(url)
315
+ if website_content:
316
+ break
317
  context_content = extract_content(context_url) if context_url else ""
318
 
319
  # Fetch details from the company website
320
+ company_sitemap_url = urljoin(company_url, "sitemap_index.xml")
321
+ company_content = safe_extract_content(company_url)
322
+ if not company_content:
323
+ # If direct scraping fails, fall back to the sitemap
324
+ company_urls = parse_sitemap(company_sitemap_url)
325
+ if isinstance(urls, list):
326
+ for url in urls:
327
+ company_content = safe_extract_content(url)
328
+ if company_content:
329
+ break
330
 
331
  # Construct the purpose-specific prompt
332
  if email_purpose == "Job Application":