Spaces:
No application file
No application file
| import logging | |
| from datetime import datetime | |
| from firecrawl import FirecrawlApp | |
| import json | |
| import os | |
| import requests | |
| import time | |
| import google.generativeai as genai | |
# Initialize logging
logging.basicConfig(level=logging.DEBUG)

# Initialize Firecrawl.
# SECURITY: this API key was hard-coded in source. Prefer the environment;
# the old literal is kept only as a fallback so existing deployments work.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "fc-b69d6504ab0a42b79e87b7827a538199")
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
logging.info("Firecrawl initialized")

# Initialize Gemini (optional — skipped when no API key is configured).
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
# BUGFIX: `model` used to be bound only inside the if-branch, so any later
# `model.generate_content(...)` raised NameError when the key was absent.
# Binding it to None makes the "not configured" state explicit.
model = None
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel('gemini-1.5-flash')
    logging.info("Gemini initialized")
else:
    logging.warning("No Gemini API key found")

# Folder where raw Gemini analyses are written for later inspection.
output_folder = 'gemini_outputs'
os.makedirs(output_folder, exist_ok=True)
def extract_domain(url):
    """Return the domain (netloc) of *url* without a leading ``www.``.

    Args:
        url: URL string to parse.

    Returns:
        The bare domain, or *url* unchanged when parsing fails.
    """
    try:
        from urllib.parse import urlparse
        domain = urlparse(url).netloc
        # BUGFIX: only strip a *leading* "www." — the old
        # replace('www.', '') also mangled domains such as
        # "sub.www.example.com".
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        return domain
    except Exception:  # narrowed from a bare except; still best-effort
        return url
def _scrape_query_results(business_query, query, max_attempts=2):
    """Search for *query* and scrape at most one good page for it.

    Args:
        business_query: Business/product name, used for placeholder text.
        query: Search query string fed to the custom search API.
        max_attempts: Maximum number of *failed* scrape attempts before
            giving up on this query (matches the original counting: only
            scrape errors increment the counter; a success breaks out).

    Returns:
        List of content dicts (url, domain, section, date, content).
    """
    items = []
    search_results = custom_search_api(query)
    attempts = 0  # counts failed scrapes only
    for url in search_results:
        if attempts >= max_attempts:
            break
        # Skip social networks — login-walled and low-signal.
        if any(x in url.lower() for x in ['linkedin', 'facebook', 'twitter']):
            continue
        try:
            logging.info(f"Scraping: {url}")
            response = firecrawl_app.scrape_url(
                url=url,
                params={'formats': ['markdown']}
            )
            if response and 'markdown' in response:
                content = response['markdown']
                if len(content) > 200:  # ignore stubs / error pages
                    logging.info("Successfully scraped content")
                    items.append({
                        'url': url,
                        'domain': extract_domain(url),
                        'section': 'Feature Analysis',
                        'date': datetime.now().strftime("%Y-%m-%d"),
                        'content': content[:1000]  # cap prompt size
                    })
                    break  # one good page per query is enough
        except Exception as e:
            if "402" in str(e):
                # Credit limit hit: record a placeholder so the report can
                # still cite where the data would have come from.
                logging.warning(f"Firecrawl credit limit reached for {url}")
                items.append({
                    'url': url,
                    'domain': extract_domain(url),
                    'section': 'Feature Analysis (Limited)',
                    'date': datetime.now().strftime("%Y-%m-%d"),
                    'content': f"Content from {extract_domain(url)} about {business_query}'s features"
                })
            else:
                logging.error(f"Error scraping {url}: {str(e)}")
            attempts += 1
    return items


def get_feature_data(business_query):
    """Get feature priority data using custom search API and Firecrawl.

    Searches the web for feature/roadmap material about *business_query*,
    scrapes promising pages, then asks Gemini for a structured analysis.

    Args:
        business_query: Name or description of the business/product.

    Returns:
        dict with keys ``social_impact``, ``economic_impact``,
        ``environmental_impact``, ``implementation_priority`` (lists of
        bullet strings) and ``sources`` (source metadata dicts). Falls back
        to :func:`generate_fallback_response` when nothing could be scraped
        or the LLM analysis fails.
    """
    logging.info(f"\n{'='*50}\nGathering feature data for: {business_query}\n{'='*50}")

    result = {
        "social_impact": [],
        "economic_impact": [],
        "environmental_impact": [],
        "implementation_priority": [],
        "sources": []
    }

    search_queries = [
        f"{business_query} product features analysis",
        f"{business_query} feature prioritization",
        f"{business_query} product roadmap",
        f"{business_query} user requirements",
        f"{business_query} product development priorities"
    ]

    scraped_content = []
    for query in search_queries:
        try:
            logging.info(f"\nSearching for: {query}")
            scraped_content.extend(_scrape_query_results(business_query, query))
            time.sleep(2)  # be polite to the search/scrape services
        except Exception as e:
            logging.error(f"Error in search: {str(e)}")
            continue

    if scraped_content:
        try:
            prompt = f"""
Analyze this content about {business_query}'s features and create a detailed priority analysis.
Content to analyze:
{[item['content'] for item in scraped_content]}
Provide a structured analysis with these exact sections:
SOCIAL IMPACT:
• Community Benefits
• Employment Impact
• Social Value
ECONOMIC IMPACT:
• Revenue Generation
• Market Growth
• Innovation Impact
ENVIRONMENTAL IMPACT:
• Sustainability
• Resource Usage
• Carbon Footprint
IMPLEMENTATION PRIORITY:
• Timeline
• Resources
• Success Metrics
Use factual information where available, mark inferences with (Inferred).
Format each point as a clear, actionable item.
"""
            # NOTE: `model` exists only when GOOGLE_API_KEY was configured at
            # import time; if it is missing, the error is caught below and we
            # return the fallback response.
            response = model.generate_content(prompt)
            analysis = response.text

            # Persist the raw Gemini output for debugging/auditing.
            output_file_path = os.path.join(output_folder, 'compitoone.txt')
            with open(output_file_path, 'w') as output_file:
                output_file.write(analysis)
            logging.info(f"Gemini output saved to {output_file_path}")

            # Split the analysis into its four mandated sections.
            result["social_impact"] = extract_section(analysis, "SOCIAL IMPACT")
            result["economic_impact"] = extract_section(analysis, "ECONOMIC IMPACT")
            result["environmental_impact"] = extract_section(analysis, "ENVIRONMENTAL IMPACT")
            result["implementation_priority"] = extract_section(analysis, "IMPLEMENTATION PRIORITY")

            # Cite every page that contributed content (without the content).
            result["sources"] = [{
                'url': item['url'],
                'domain': item['domain'],
                'section': item['section'],
                'date': item['date']
            } for item in scraped_content]

            return result
        except Exception as e:
            logging.error(f"Error generating analysis: {str(e)}")
            return generate_fallback_response(business_query)

    return generate_fallback_response(business_query)
def custom_search_api(query):
    """Perform a custom search using the Google Custom Search API.

    Args:
        query: Free-text search query.

    Returns:
        List of result URLs (up to 2), or [] on an HTTP error status.

    Raises:
        requests.RequestException: on network failure or timeout (callers
        in this module catch broad Exception and continue).
    """
    # SECURITY: credentials were hard-coded in source. Prefer environment
    # overrides; the literals remain only as a fallback for existing deploys.
    api_key = os.getenv("GOOGLE_CSE_API_KEY", "AIzaSyAxeLlJ6vZxOl-TblUJg_dInBS3vNxaFVY")
    search_engine_id = os.getenv("GOOGLE_CSE_ID", "37793b12975da4e35")
    # BUGFIX: let requests URL-encode the parameters — the old f-string
    # interpolation broke on queries containing '&', '#', '+', etc. — and
    # bound the wait with a timeout so a hung request can't stall the run.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"key": api_key, "cx": search_engine_id, "q": query, "num": 2},
        timeout=15,
    )
    if response.status_code == 200:
        search_results = response.json().get('items', [])
        return [item['link'] for item in search_results]
    logging.error(f"Error in custom search API: {response.status_code} - {response.text}")
    return []
def extract_section(text, section_name):
    """Extract the bullet lines belonging to one section of the analysis.

    Args:
        text: Full analysis text containing ``SECTION NAME:`` headers.
        section_name: Header to extract (without the trailing colon).

    Returns:
        List of cleaned bullet strings; [] when the section is absent or
        on unexpected error.
    """
    section_headers = ("SOCIAL IMPACT", "ECONOMIC IMPACT",
                       "ENVIRONMENTAL IMPACT", "IMPLEMENTATION PRIORITY")
    try:
        lines = []
        in_section = False
        for line in text.split('\n'):
            if section_name + ":" in line:
                in_section = True
                continue
            elif any(s + ":" in line for s in section_headers):
                # Reached the next section header — stop collecting.
                in_section = False
            elif in_section and line.strip():
                # BUGFIX: also strip the '•' bullet that the prompt
                # explicitly asks Gemini to use; the old strip('- *')
                # left it on every extracted line.
                cleaned_line = line.strip('•- *').strip()
                # Drop sub-headers (lines ending with ':') and empties.
                if cleaned_line and not cleaned_line.endswith(':'):
                    lines.append(cleaned_line)
        return lines
    except Exception as e:
        logging.error(f"Error extracting section {section_name}: {str(e)}")
        return []
def generate_fallback_response(business_query):
    """Build a placeholder feature-priority analysis for *business_query*.

    Used when scraping or the LLM analysis produced nothing. Every item is
    explicitly tagged ``(Inferred)`` so downstream consumers can tell
    placeholder output apart from real analysis. Sources are empty since no
    page contributed content.
    """
    social = [
        f"Community impact assessment for {business_query} pending (Inferred)",
        "Employment effects to be evaluated (Inferred)",
        "Social value contribution potential (Inferred)",
    ]
    economic = [
        "Revenue potential being assessed (Inferred)",
        "Market growth opportunities pending analysis (Inferred)",
        "Innovation impact to be determined (Inferred)",
    ]
    environmental = [
        "Sustainability initiatives to be evaluated (Inferred)",
        "Resource usage assessment pending (Inferred)",
        "Carbon footprint analysis needed (Inferred)",
    ]
    priority = [
        "Timeline development in progress (Inferred)",
        "Resource requirements being assessed (Inferred)",
        "Success metrics to be defined (Inferred)",
    ]
    return {
        "social_impact": social,
        "economic_impact": economic,
        "environmental_impact": environmental,
        "implementation_priority": priority,
        "sources": [],
    }