Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from openai import OpenAI | |
| from pydantic import BaseModel | |
| from typing import List | |
| from application.utils.logger import get_logger | |
| from typing import Literal | |
| from duckduckgo_search import DDGS | |
| from tavily import TavilyClient | |
| from langchain_core.tools import tool | |
| import ast | |
# Module-wide logger supplied by the application's logging helper.
logger = get_logger()

# Must run before the environment lookups below so that any values it adds
# to the process environment are visible to them.
load_dotenv()

# Ensure a "reports" directory exists relative to the current working
# directory; exist_ok=True makes this a no-op when it is already there.
os.makedirs("reports", exist_ok=True)

# API credentials are read from the environment; a missing variable yields None.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Shared service clients used by the functions in this module.
tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
client = OpenAI(api_key=OPENAI_API_KEY)
# Structured result returned by get_top_companies_from_web.
# (Documented with comments rather than a docstring so the generated
# pydantic schema is unchanged.)
class CompanyListResponse(BaseModel):
    # Company names, in the order they were parsed from the model's reply.
    companies: list[str]
# Example of a parsed company list: ['Puma', 'Gap', 'PVH Corp.', 'GUESS', 'Hugo Boss']
def get_top_companies_from_web(query: str) -> CompanyListResponse:
    """
    Ask the OpenAI Responses API (with web search enabled) for a list of
    company names matching ``query`` and return it as a structured response.

    The query is embedded in a prompt that instructs the model to answer with
    a Python list literal only. How many companies come back is decided by the
    model from the query text; this function does not enforce a count.

    Args:
        query (str): The search query from the user.

    Returns:
        CompanyListResponse: The company names parsed from the model's reply.

    Raises:
        ValueError: If the API call fails, the reply cannot be parsed as a
            Python literal, or the parsed value fails model validation. The
            original exception is attached as ``__cause__``.
    """
    prompt = (
        f"{query} "
        "focusing on globally recognized companies known for size, influence, or sustainability efforts. "
        "Respond with a Python list of company names only, no explanation. "
        "Example: ['Company A', 'Company B', 'Company C']. "
        "Please do not include any other text or formatting."
    )
    logger.info(f'User query : {query}')

    # Bound before the try block so the error path below can always reference
    # it, even when the API call itself raises before any output exists.
    output = None
    try:
        response = client.responses.create(
            model="gpt-4o-mini",
            tools=[{"type": "web_search_preview"}],
            input=prompt,
        )
        output = response.output_text
        text = output.strip()

        # The reply is untrusted model output: parse it with literal_eval,
        # which only accepts Python literals, never with eval().
        try:
            parsed_list = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # The model sometimes wraps the list in extra text (for example a
            # markdown code fence). Retry on the outermost [...] span; if there
            # is none, re-raise the original parsing error.
            start = text.find("[")
            end = text.rfind("]")
            if start == -1 or end <= start:
                raise
            parsed_list = ast.literal_eval(text[start:end + 1])

        logger.info(f"Parsed List: {parsed_list}")
        return CompanyListResponse(companies=parsed_list)
    except Exception as e:
        logger.error(f"Error parsing response: {e}")
        raise ValueError(f"Failed to parse company list: {output}") from e
def get_sustainability_report_pdf(
    company_name: str,
    year: int | None = None,
    max_results: int = 1,
    search_engine: Literal["tavily", "duckduckgo", "both"] = "duckduckgo",
) -> str | None:
    """
    Finds and returns the direct PDF link for the sustainability report of a SPECIFIC, NAMED company.
    Use this tool when the user provides the exact name of the company they want the report for.
    Optionally, a specific 'year' can be provided.

    Args:
        company_name (str): The name of the company.
        year (int, optional): The year of the sustainability report. Defaults to None.
        max_results (int, optional): Maximum number of search results to request
            from whichever engine is used (Tavily and/or DuckDuckGo). Defaults to 1.
        search_engine (str, optional): Search engine to use.
            - "tavily"     : only use Tavily search
            - "duckduckgo" (default): only use DuckDuckGo
            - "both"       : try Tavily first, fall back to DuckDuckGo if needed

    Returns:
        str or None: The URL of the sustainability report PDF if found, otherwise None.

    Raises:
        ValueError: If ``search_engine`` is not one of the supported options.

    Search Strategy:
        - Tavily: Searches with search_depth="advanced".
        - DuckDuckGo: Text search; the query carries a 'filetype:pdf' filter.
        - A URL counts as a PDF when it ends with '.pdf', ignoring any query
          string or fragment (so 'report.pdf?v=2' is accepted).

    Notes:
        - Search failures are caught and logged inside each engine helper,
          which then reports "not found" by returning None.
    """

    def is_pdf_url(url) -> bool:
        # True when `url` is a string that points at a .pdf resource.
        if not isinstance(url, str):
            return False
        lowered = url.lower()
        if lowered.endswith(".pdf"):
            return True
        # Also accept links whose path ends in .pdf but which carry a query
        # string or fragment, e.g. ".../report.pdf?download=1".
        path_only = lowered.split("#", 1)[0].split("?", 1)[0]
        return path_only.endswith(".pdf")

    def search_with_tavily(query: str) -> str | None:
        # Return the first PDF URL among Tavily's results, or None.
        try:
            logger.info(f"Searching Tavily for: {query}")
            result = tavily_client.search(query=query, search_depth="advanced", max_results=max_results)
            for res in result.get("results", []):
                # .get() so one result without a "url" key does not discard
                # the remaining results.
                url = res.get("url", "")
                if is_pdf_url(url):
                    logger.info(f"Found PDF via Tavily: {url}")
                    return url
            logger.info("No PDF found via Tavily.")
        except Exception as e:
            logger.error(f"Tavily search error: {e}")
        return None

    def search_with_duckduckgo(query: str, max_results: int) -> str | None:
        # Return the first PDF URL among DuckDuckGo's results, or None.
        try:
            logger.info(f"Searching DuckDuckGo for: {query}")
            with DDGS() as ddgs:
                search_results = ddgs.text(query.strip(), max_results=max_results)
                for result in search_results:
                    pdf_url = result.get('href', '')
                    if is_pdf_url(pdf_url):
                        logger.info(f"Found PDF via DuckDuckGo: {pdf_url}")
                        return pdf_url
                    logger.info(f"Skipped non-PDF link: {pdf_url}")
        except Exception as error:
            logger.error(f"DuckDuckGo search error: {error}")
        return None

    # Compose search query
    query = f"{company_name} sustainability report filetype:pdf"
    if year:
        query += f" {year}"
    logger.info(f"Starting sustainability report search for '{company_name}', year={year}, using '{search_engine}' engine.")

    # Perform search according to engine selection
    if search_engine == "tavily":
        return search_with_tavily(query)
    if search_engine == "duckduckgo":
        return search_with_duckduckgo(query, max_results=max_results)
    if search_engine == "both":
        # Prefer Tavily; only query DuckDuckGo when Tavily found nothing.
        pdf_url = search_with_tavily(query)
        if not pdf_url:
            pdf_url = search_with_duckduckgo(query, max_results=max_results)
        return pdf_url

    logger.error(f"Invalid search engine option provided: {search_engine}")
    raise ValueError(f"Invalid search engine '{search_engine}'. Choose from 'tavily', 'duckduckgo', or 'both'.")