Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import re | |
| from typing import Optional, Dict, Union, BinaryIO | |
| import requests | |
| from google import genai | |
| from google.genai import types | |
| from application.utils.logger import get_logger | |
| from application.services.gemini_api_service import upload_file | |
| from application.services.mongo_db_service import store_document | |
| from application.schemas.response_schema import GEMINI_GHG_PARAMETERS | |
| from langchain_core.tools import tool | |
# Module-wide logger used for token-usage, warning and error reporting below.
logger = get_logger()

# Identifier of the Gemini model passed to every generate_content call.
MODEL = "gemini-2.0-flash"

# Shared Gemini client; the API key is read from the GEMINI_API_KEY
# environment variable once, at import time.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
# Instruction prompt sent to Gemini together with the uploaded PDF; it asks
# for a schema-compliant JSON object and tells the model how to treat tables,
# infographics, missing values and multi-year data.
PROMPT = """You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
### Instructions:
1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
- Narrative text
- Tables
- Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
- Footnotes or appendices
3. **Infographic Handling**: For infographics, prioritize:
- Text labels or annotations within the graphic
- Captions or descriptions near the infographic
- Legends or keys that clarify values
- If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
5. **Edge Cases**:
- If data is missing, use placeholders as specified in the schema.
- If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
### Output Requirements:
- Return a JSON object adhering to the schema.
- Ensure all fields are populated, using placeholders for missing data.
- Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
### Task:
- Parse the PDF thoroughly to extract all relevant data.
- Ensure consistency in units, years, and terminology across the output.
- Handle infographics with care, prioritizing textual data and flagging estimates.
- Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
"""
def extract_emission_data_as_json(file_input: Union[BinaryIO, bytes, str]) -> Optional[Dict]:
    """
    Extract emission-related ESG data from a PDF file using the Gemini API.

    The input is handed to ``upload_file``; the returned file handle is sent to
    Gemini together with ``PROMPT`` and the ``GEMINI_GHG_PARAMETERS`` response
    schema. The response text is parsed as JSON and, on success, the
    "Greenhouse Gas (GHG) Protocol Parameters" section is persisted through
    ``store_document`` under the extracted "Company Name" (falling back to
    "Unknown Company").

    Args:
        file_input (Union[BinaryIO, bytes, str]):
            The input file to process. Can be a file object, byte stream, or
            local file path.

    Returns:
        Optional[Dict]:
            The parsed JSON payload if parsing succeeds,
            ``{"raw_response": <text>}`` if the response is not valid JSON,
            or ``None`` if any other error occurs (upload, API call, storage).

    Notes:
        - This function does not raise: every exception is logged with its
          traceback and converted into a ``None`` return value.
        - Token usage (input, output, total) is logged when the response
          carries usage metadata.
    """
    try:
        uploaded_file = upload_file(file=file_input)
        response = client.models.generate_content(
            model=MODEL,
            contents=[uploaded_file, PROMPT],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_GHG_PARAMETERS,
                'temperature': 0.0,
            },
        )

        # The attribute can exist but be None; a plain hasattr() check would
        # then crash on the dereference and discard an otherwise valid
        # response just because logging failed.
        usage = getattr(response, 'usage_metadata', None)
        if usage is not None:
            logger.info(f"Input tokens: {usage.prompt_token_count}")
            logger.info(f"Output tokens: {usage.candidates_token_count}")
            logger.info(f"Total tokens: {usage.total_token_count}")
        else:
            logger.info("Token usage metadata not available in response")
        logger.info("[Gemini] Response received.")

        # Keep the try body down to the single call that can raise
        # JSONDecodeError so storage errors are not mistaken for parse errors.
        try:
            result = json.loads(response.text)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response.text}

        file_name = result.get('Company Name', 'Unknown Company')
        document = {
            "Greenhouse Gas (GHG) Protocol Parameters": result.get(
                'Greenhouse Gas (GHG) Protocol Parameters'
            )
        }
        store_document(file_name, document)

        # Reuse the already-parsed payload instead of parsing the text again.
        # NOTE(review): assumes store_document does not mutate the nested GHG
        # section it is given — confirm against its implementation.
        return result
    except Exception:
        # Top-level boundary: log with traceback and signal failure via None.
        logger.exception("Error during ESG data extraction.")
        return None