# greenwash_utils.py — PTBV6in1 / agents
# (source: gaialive's Hugging Face upload "Upload 9 files", commit 7d37b33, verified)
import fitz # PyMuPDF for PDF extraction
import docx # python-docx for DOCX extraction
import requests
import json
import streamlit as st
def extract_text(uploaded_file):
    """Extract plain text from a Streamlit-uploaded PDF or DOCX file.

    Returns the extracted text on success; on an unsupported MIME type or
    any extraction failure, returns a human-readable ``"Error: ..."`` string
    (callers branch on that convention rather than on exceptions).
    """
    pdf_mime = "application/pdf"
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    try:
        mime = uploaded_file.type
        if mime == pdf_mime:
            # PyMuPDF reads the raw bytes; one text blob per page.
            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf_doc:
                pages = [page.get_text() for page in pdf_doc]
            return "\n".join(pages)
        if mime == docx_mime:
            document = docx.Document(uploaded_file)
            return "\n".join(paragraph.text for paragraph in document.paragraphs)
        return "Error: Unsupported file type."
    except Exception as e:
        # Broad catch is deliberate: any parser failure becomes an error string.
        return f"Error processing file: {e}"
def chunk_text(text, chunk_size=4000, overlap=200):
    """Split *text* into overlapping chunks.

    Parameters:
        text: the string to split; any non-string input yields ``[]``.
        chunk_size: maximum length of each chunk.
        overlap: number of characters shared between consecutive chunks.

    Returns:
        A list of string chunks covering *text* in order.
    """
    if not isinstance(text, str):
        return []
    # Bug fix: if overlap >= chunk_size the original step (chunk_size - overlap)
    # was <= 0 and the loop never advanced — clamp the step to at least 1.
    step = max(1, chunk_size - overlap)
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += step
    return chunks
def analyze_chunk(chunk):
    """Analyze a single text chunk for greenwashing using the Groq API.

    Parameters:
        chunk: the text excerpt to analyze.

    Returns:
        A list of issue dicts with keys "type", "excerpt", "explanation",
        and "suggestion". Configuration or API failures are reported as a
        single-element list using the same dict shape (never raised).
    """
    try:
        groq_api_key = st.secrets["GROQ_API_KEY"]
    # Bug fix: st.secrets raises FileNotFoundError when no secrets file exists
    # but KeyError when the file exists without this key — catch both so a
    # missing key yields the configuration-error payload instead of crashing.
    except (FileNotFoundError, KeyError):
        return [{"type": "Configuration Error", "excerpt": "N/A", "explanation": "GROQ_API_KEY secret not found.", "suggestion": "Please ensure the GROQ_API_KEY is set in your Streamlit secrets."}]
    prompt = f"""
You are a critical ESG analyst specializing in identifying greenwashing. Analyze the following text for potential greenwashing. For each issue, identify the type of greenwashing (e.g., Vague Language, Irrelevant Claims, Hidden Trade-offs, No Proof), extract the specific sentence or phrase, explain why it's a potential issue, and suggest a more transparent alternative.
Text to analyze:
'''{chunk}'''
Format your response as a valid JSON list of objects. Each object must have the keys: "type", "excerpt", "explanation", and "suggestion". If no issues are found, return an empty list [].
"""
    api_url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {groq_api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama3-8b-8192",
        "messages": [{"role": "user", "content": prompt}],
        # json_object mode forces a JSON object; the list may arrive wrapped
        # in a key such as "issues" — unwrapped below.
        "response_format": {"type": "json_object"},
        "max_tokens": 2048,
        "temperature": 0.4
    }
    try:
        # Bug fix: an explicit timeout so a hung connection surfaces as a
        # RequestException instead of blocking the Streamlit script forever.
        response = requests.post(api_url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()
        issues_str = data["choices"][0]["message"]["content"]
        # The message content is itself a JSON string — parse it.
        parsed_issues = json.loads(issues_str)
        # The list may be nested inside a key like 'issues'.
        if isinstance(parsed_issues, dict) and 'issues' in parsed_issues:
            parsed_issues = parsed_issues['issues']
        # Robustness: callers iterate a list of dicts; never return a scalar.
        return parsed_issues if isinstance(parsed_issues, list) else []
    except requests.exceptions.RequestException:
        return [{"type": "API Error", "excerpt": "N/A", "explanation": "Could not connect to the analysis service.", "suggestion": "Check your API key and network connection."}]
    except (json.JSONDecodeError, KeyError, IndexError):
        return [{"type": "API Response Error", "excerpt": "N/A", "explanation": "The analysis service returned an invalid response.", "suggestion": "This may be a temporary issue. Please try again."}]