import fitz  # PyMuPDF, used for PDF text extraction
import docx  # python-docx, used for DOCX text extraction
import requests
import json
import streamlit as st
def extract_text(uploaded_file):
    """Extracts text from an uploaded file (PDF or DOCX)."""
    try:
        if uploaded_file.type == "application/pdf":
            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            doc = docx.Document(uploaded_file)
            return "\n".join(para.text for para in doc.paragraphs)
        else:
            return "Error: Unsupported file type."
    except Exception as e:
        return f"Error processing file: {e}"
def chunk_text(text, chunk_size=4000, overlap=200):
    """Splits text into overlapping chunks."""
    if not isinstance(text, str):
        return []
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Advance by less than chunk_size so consecutive chunks share some context.
        start += chunk_size - overlap
    return chunks
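# Illustrative example (not in the original file): with the defaults above, a
# 9,000-character string splits into three chunks of 4,000, 4,000 and 1,400
# characters, each consecutive pair sharing a 200-character overlap, e.g.
#   chunk_text("a" * 9000)  ->  chunk lengths [4000, 4000, 1400]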
def analyze_chunk(chunk):
    """Analyzes a single text chunk for greenwashing using the Groq API."""
    try:
        groq_api_key = st.secrets["GROQ_API_KEY"]
    except (KeyError, FileNotFoundError):
        # A missing key raises KeyError; a missing secrets.toml raises FileNotFoundError.
        return [{"type": "Configuration Error", "excerpt": "N/A", "explanation": "GROQ_API_KEY secret not found.", "suggestion": "Please ensure the GROQ_API_KEY is set in your Streamlit secrets."}]
    prompt = f"""
You are a critical ESG analyst specializing in identifying greenwashing. Analyze the following text for potential greenwashing. For each issue, identify the type of greenwashing (e.g., Vague Language, Irrelevant Claims, Hidden Trade-offs, No Proof), extract the specific sentence or phrase, explain why it is a potential issue, and suggest a more transparent alternative.

Text to analyze:
'''{chunk}'''

Respond with a valid JSON object containing a single key "issues", whose value is a list of objects. Each object must have the keys: "type", "excerpt", "explanation", and "suggestion". If no issues are found, return {{"issues": []}}.
    """
    api_url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {groq_api_key}", "Content-Type": "application/json"}
    payload = {
        "model": "llama3-8b-8192",
        "messages": [{"role": "user", "content": prompt}],
        # JSON mode constrains the response to a single JSON object, hence the "issues" wrapper requested in the prompt.
        "response_format": {"type": "json_object"},
        "max_tokens": 2048,
        "temperature": 0.4,
    }
    try:
        # Timeout keeps a stalled request from hanging the app indefinitely.
        response = requests.post(api_url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()
        issues_str = data["choices"][0]["message"]["content"]
        # The message content is itself a JSON string, so parse it.
        parsed_issues = json.loads(issues_str)
        # The list may be nested under an 'issues' key, as requested in the prompt.
        if isinstance(parsed_issues, dict) and "issues" in parsed_issues:
            return parsed_issues["issues"]
        return parsed_issues  # Otherwise assume the model returned the list directly.
    except requests.exceptions.RequestException:
        return [{"type": "API Error", "excerpt": "N/A", "explanation": "Could not connect to the analysis service.", "suggestion": "Check your API key and network connection."}]
    except (json.JSONDecodeError, KeyError, IndexError):
        return [{"type": "API Response Error", "excerpt": "N/A", "explanation": "The analysis service returned an invalid response.", "suggestion": "This may be a temporary issue. Please try again."}]