# greenwash_utils.py — PTBV6in1 / agents
# (source: gaialive's Hugging Face upload "Upload 9 files", commit 7d37b33, verified)
import fitz # PyMuPDF for PDF extraction
import docx # python-docx for DOCX extraction
import requests
import json
import streamlit as st
def extract_text(uploaded_file):
    """Extract plain text from a Streamlit-uploaded PDF or DOCX file.

    Returns the extracted text on success; on an unsupported MIME type or
    any extraction failure, returns a human-readable ``"Error: ..."`` string
    (callers branch on that convention rather than on exceptions).
    """
    pdf_mime = "application/pdf"
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    try:
        mime = uploaded_file.type
        if mime == pdf_mime:
            # PyMuPDF reads the raw bytes; one text blob per page.
            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf_doc:
                pages = [page.get_text() for page in pdf_doc]
            return "\n".join(pages)
        if mime == docx_mime:
            document = docx.Document(uploaded_file)
            return "\n".join(paragraph.text for paragraph in document.paragraphs)
        return "Error: Unsupported file type."
    except Exception as e:
        # Broad catch is deliberate: any parser failure becomes an error string.
        return f"Error processing file: {e}"
def chunk_text(text, chunk_size=4000, overlap=200):
    """Split *text* into overlapping chunks.

    Parameters:
        text: the string to split; any non-string input yields ``[]``.
        chunk_size: maximum length of each chunk.
        overlap: number of characters shared between consecutive chunks.

    Returns:
        A list of string chunks covering *text* in order.
    """
    if not isinstance(text, str):
        return []
    # Bug fix: if overlap >= chunk_size the original step (chunk_size - overlap)
    # was <= 0 and the loop never advanced — clamp the step to at least 1.
    step = max(1, chunk_size - overlap)
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += step
    return chunks
def analyze_chunk(chunk):
    """Analyze a single text chunk for greenwashing using the Groq API.

    Parameters:
        chunk: the text excerpt to analyze.

    Returns:
        A list of issue dicts with keys "type", "excerpt", "explanation",
        and "suggestion". Configuration or API failures are reported as a
        single-element list using the same dict shape (never raised).
    """
    try:
        groq_api_key = st.secrets["GROQ_API_KEY"]
    # Bug fix: st.secrets raises FileNotFoundError when no secrets file exists
    # but KeyError when the file exists without this key — catch both so a
    # missing key yields the configuration-error payload instead of crashing.
    except (FileNotFoundError, KeyError):
        return [{"type": "Configuration Error", "excerpt": "N/A", "explanation": "GROQ_API_KEY secret not found.", "suggestion": "Please ensure the GROQ_API_KEY is set in your Streamlit secrets."}]
    prompt = f"""
You are a critical ESG analyst specializing in identifying greenwashing. Analyze the following text for potential greenwashing. For each issue, identify the type of greenwashing (e.g., Vague Language, Irrelevant Claims, Hidden Trade-offs, No Proof), extract the specific sentence or phrase, explain why it's a potential issue, and suggest a more transparent alternative.
Text to analyze:
'''{chunk}'''
Format your response as a valid JSON list of objects. Each object must have the keys: "type", "excerpt", "explanation", and "suggestion". If no issues are found, return an empty list [].
"""
    api_url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {groq_api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama3-8b-8192",
        "messages": [{"role": "user", "content": prompt}],
        # json_object mode forces a JSON object; the list may arrive wrapped
        # in a key such as "issues" — unwrapped below.
        "response_format": {"type": "json_object"},
        "max_tokens": 2048,
        "temperature": 0.4
    }
    try:
        # Bug fix: an explicit timeout so a hung connection surfaces as a
        # RequestException instead of blocking the Streamlit script forever.
        response = requests.post(api_url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()
        issues_str = data["choices"][0]["message"]["content"]
        # The message content is itself a JSON string — parse it.
        parsed_issues = json.loads(issues_str)
        # The list may be nested inside a key like 'issues'.
        if isinstance(parsed_issues, dict) and 'issues' in parsed_issues:
            parsed_issues = parsed_issues['issues']
        # Robustness: callers iterate a list of dicts; never return a scalar.
        return parsed_issues if isinstance(parsed_issues, list) else []
    except requests.exceptions.RequestException:
        return [{"type": "API Error", "excerpt": "N/A", "explanation": "Could not connect to the analysis service.", "suggestion": "Check your API key and network connection."}]
    except (json.JSONDecodeError, KeyError, IndexError):
        return [{"type": "API Response Error", "excerpt": "N/A", "explanation": "The analysis service returned an invalid response.", "suggestion": "This may be a temporary issue. Please try again."}]