# feature_api.py
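"""Feature priority analysis module.

Gathers feature-related content for a business query via the Google Custom
Search API, scrapes the results with Firecrawl, and asks Gemini to produce a
structured social / economic / environmental / implementation-priority
analysis, falling back to placeholder output when no data or model is
available.
"""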
import json
import logging
import os
import time
from datetime import datetime
from urllib.parse import urlparse

import google.generativeai as genai
import requests
from firecrawl import FirecrawlApp
# Initialize logging
logging.basicConfig(level=logging.DEBUG)

# Initialize Firecrawl
FIRECRAWL_API_KEY = "fc-b69d6504ab0a42b79e87b7827a538199"
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
logging.info("Firecrawl initialized")

# Initialize Gemini
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel('gemini-1.5-flash')
    logging.info("Gemini initialized")
else:
    model = None
    logging.warning("No Gemini API key found")

# Create a folder to store Gemini outputs
output_folder = 'gemini_outputs'
os.makedirs(output_folder, exist_ok=True)
def extract_domain(url):
    """Extract domain name from URL"""
    try:
        domain = urlparse(url).netloc
        return domain.replace('www.', '')
    except Exception:
        return url
def get_feature_data(business_query):
    """Get feature priority data using custom search API and Firecrawl"""
    logging.info(f"\n{'='*50}\nGathering feature data for: {business_query}\n{'='*50}")

    result = {
        "social_impact": [],
        "economic_impact": [],
        "environmental_impact": [],
        "implementation_priority": [],
        "sources": []
    }

    search_queries = [
        f"{business_query} product features analysis",
        f"{business_query} feature prioritization",
        f"{business_query} product roadmap",
        f"{business_query} user requirements",
        f"{business_query} product development priorities"
    ]

    scraped_content = []
    max_attempts = 2  # Limit scraping attempts per query

    for query in search_queries:
        try:
            logging.info(f"\nSearching for: {query}")
            search_results = custom_search_api(query)
            attempts = 0

            for url in search_results:
                if attempts >= max_attempts:
                    break

                # Skip social media pages, which rarely contain useful feature analysis
                if not any(x in url.lower() for x in ['linkedin', 'facebook', 'twitter']):
                    try:
                        logging.info(f"Scraping: {url}")
                        response = firecrawl_app.scrape_url(
                            url=url,
                            params={'formats': ['markdown']}
                        )

                        if response and 'markdown' in response:
                            content = response['markdown']
                            if len(content) > 200:
                                logging.info("Successfully scraped content")
                                scraped_content.append({
                                    'url': url,
                                    'domain': extract_domain(url),
                                    'section': 'Feature Analysis',
                                    'date': datetime.now().strftime("%Y-%m-%d"),
                                    'content': content[:1000]
                                })
                                break
                    except Exception as e:
                        if "402" in str(e):
                            logging.warning(f"Firecrawl credit limit reached for {url}")
                            scraped_content.append({
                                'url': url,
                                'domain': extract_domain(url),
                                'section': 'Feature Analysis (Limited)',
                                'date': datetime.now().strftime("%Y-%m-%d"),
                                'content': f"Content from {extract_domain(url)} about {business_query}'s features"
                            })
                        else:
                            logging.error(f"Error scraping {url}: {str(e)}")

                    attempts += 1
                    continue

            time.sleep(2)  # Throttle between search queries

        except Exception as e:
            logging.error(f"Error in search: {str(e)}")
            continue
    if scraped_content and model:
        try:
            prompt = f"""
            Analyze this content about {business_query}'s features and create a detailed priority analysis.

            Content to analyze:
            {[item['content'] for item in scraped_content]}

            Provide a structured analysis with these exact sections:

            SOCIAL IMPACT:
            • Community Benefits
            • Employment Impact
            • Social Value

            ECONOMIC IMPACT:
            • Revenue Generation
            • Market Growth
            • Innovation Impact

            ENVIRONMENTAL IMPACT:
            • Sustainability
            • Resource Usage
            • Carbon Footprint

            IMPLEMENTATION PRIORITY:
            • Timeline
            • Resources
            • Success Metrics

            Use factual information where available, mark inferences with (Inferred).
            Format each point as a clear, actionable item.
            """

            response = model.generate_content(prompt)
            analysis = response.text

            # Save Gemini output to a text file
            output_file_path = os.path.join(output_folder, 'compitoone.txt')
            with open(output_file_path, 'w') as output_file:
                output_file.write(analysis)
            logging.info(f"Gemini output saved to {output_file_path}")

            # Extract sections
            result["social_impact"] = extract_section(analysis, "SOCIAL IMPACT")
            result["economic_impact"] = extract_section(analysis, "ECONOMIC IMPACT")
            result["environmental_impact"] = extract_section(analysis, "ENVIRONMENTAL IMPACT")
            result["implementation_priority"] = extract_section(analysis, "IMPLEMENTATION PRIORITY")

            # Add sources
            result["sources"] = [{
                'url': item['url'],
                'domain': item['domain'],
                'section': item['section'],
                'date': item['date']
            } for item in scraped_content]

            return result
        except Exception as e:
            logging.error(f"Error generating analysis: {str(e)}")
            return generate_fallback_response(business_query)

    return generate_fallback_response(business_query)
def custom_search_api(query):
    """Perform a custom search using the Google Custom Search API"""
    api_key = "AIzaSyAxeLlJ6vZxOl-TblUJg_dInBS3vNxaFVY"
    search_engine_id = "37793b12975da4e35"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'key': api_key,
        'cx': search_engine_id,
        'q': query,
        'num': 2
    }
    response = requests.get(url, params=params, timeout=30)

    if response.status_code == 200:
        search_results = response.json().get('items', [])
        return [item['link'] for item in search_results]
    else:
        logging.error(f"Error in custom search API: {response.status_code} - {response.text}")
        return []
def extract_section(text, section_name):
    """Extract content from a specific section"""
    try:
        lines = []
        in_section = False

        for line in text.split('\n'):
            if section_name + ":" in line:
                in_section = True
                continue
            elif any(s + ":" in line for s in ["SOCIAL IMPACT", "ECONOMIC IMPACT", "ENVIRONMENTAL IMPACT", "IMPLEMENTATION PRIORITY"]):
                in_section = False
            elif in_section and line.strip():
                cleaned_line = line.strip('- *').strip()
                if cleaned_line and not cleaned_line.endswith(':'):
                    lines.append(cleaned_line)

        return lines
    except Exception as e:
        logging.error(f"Error extracting section {section_name}: {str(e)}")
        return []
def generate_fallback_response(business_query):
    """Generate basic feature priority analysis when no data is found"""
    return {
        "social_impact": [
            f"Community impact assessment for {business_query} pending (Inferred)",
            "Employment effects to be evaluated (Inferred)",
            "Social value contribution potential (Inferred)"
        ],
        "economic_impact": [
            "Revenue potential being assessed (Inferred)",
            "Market growth opportunities pending analysis (Inferred)",
            "Innovation impact to be determined (Inferred)"
        ],
        "environmental_impact": [
            "Sustainability initiatives to be evaluated (Inferred)",
            "Resource usage assessment pending (Inferred)",
            "Carbon footprint analysis needed (Inferred)"
        ],
        "implementation_priority": [
            "Timeline development in progress (Inferred)",
            "Resource requirements being assessed (Inferred)",
            "Success metrics to be defined (Inferred)"
        ],
        "sources": []
    }
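

# Minimal usage sketch (not part of the original module): assumes the Firecrawl,
# Google Custom Search, and Gemini credentials configured above are valid, and
# uses "Notion" purely as a hypothetical example query.
if __name__ == "__main__":
    example_query = "Notion"  # hypothetical business/product name
    feature_data = get_feature_data(example_query)
    print(json.dumps(feature_data, indent=2))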