Spaces:
Sleeping
Sleeping
| import re | |
| from sentence_transformers import util | |
| from .ml_models import ml_models | |
# Reference phrases used as semantic anchors. Each list feeds
# semantic_matches(), which flags sentences whose sentence-transformer
# embedding is close (cosine similarity) to ANY phrase in the list.

# Environmental topics: climate, emissions, energy, conservation.
ENV_REF = [
    "environment", "climate change", "carbon emissions", "pollution", "waste",
    "green energy", "renewable resources", "sustainability", "biodiversity",
    "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation"
]
# Broader ESG coverage: a superset of ENV_REF plus social/governance themes.
ESG_REF = [
    "environment", "social responsibility", "governance", "sustainability", "carbon emissions",
    "green energy", "renewable resources", "waste management", "climate change", "pollution control",
    "biodiversity", "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation",
    "community development", "employee welfare", "diversity", "ethics"
]
# Concrete, past/present-tense actions already taken.
ACTION_REF = [
    "implemented", "adopted", "reduced emissions", "recycled", "renewable energy",
    "sustainability project", "steps taken to reduce carbon emissions",
    "initiatives to help the environment", "measures to prevent greenwashing"
]
# Forward-looking claims/pledges (potential greenwashing signals).
CLAIM_REF = [
    "plans to achieve", "committed to", "targets", "pledges", "goal", "aims to",
    "intent to reduce", "objective to be", "aims for sustainability",
    "pledged to achieve", "will reduce carbon", "expect to reach net zero",
    "plans to be carbon neutral by", "commitment to net zero by",
    "goal to be eco friendly by", "target year for sustainability",
    "striving to be net zero", "intends to adopt renewable energy", "aiming for eco-friendly operations"
]
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """
    Return the sentences that are semantically close to any reference phrase.

    Args:
        sentences: Sequence of sentence strings to screen.
        reference: List of anchor phrases to compare against.
        threshold: Minimum cosine similarity for a sentence to count as a match.
        batch_size: Number of sentences encoded per model call, to bound memory.

    Returns:
        List of matching sentences (whitespace-stripped), in original order;
        empty list when nothing matches.
    """
    model = ml_models.st_model
    # Encode the reference phrases once; reused against every batch below.
    ref_emb = model.encode(reference, convert_to_tensor=True)
    matches = []
    # Process sentences in batches. Note: range() slicing never produces an
    # empty batch, so no emptiness guard is needed.
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        sent_emb = model.encode(batch, convert_to_tensor=True)
        # sim_matrix rows = batch sentences, columns = reference phrases.
        sim_matrix = util.cos_sim(sent_emb, ref_emb)
        for row, sims in enumerate(sim_matrix):
            # A sentence matches if it is close to ANY reference phrase.
            if sims.max().item() >= threshold:
                matches.append(batch[row].strip())
    return matches
def calculate_scores(sentences):
    """
    Run the four semantic screens over *sentences* and summarise the results.

    Returns a dict with per-category match counts, plus the matched
    environmental and action sentences themselves (other categories only
    report counts).
    """
    env = semantic_matches(sentences, ENV_REF)
    esg = semantic_matches(sentences, ESG_REF)
    actions = semantic_matches(sentences, ACTION_REF)
    # Claims use a slightly looser threshold than the default 0.55.
    claims = semantic_matches(sentences, CLAIM_REF, threshold=0.54)
    return {
        "env_count": len(env),
        "esg_count": len(esg),
        "action_count": len(actions),
        "claim_count": len(claims),
        "env_sentences": env,
        "action_sentences": actions,
    }
def calculate_vague_score(sentences):
    """
    Calculate the ratio of sentences containing vague/future-tense language.

    Returns a float in [0, 1]; 0.0 for an empty input list.
    """
    vague_patterns = [
        r"aim(s|ing)? to", r"plan(s|ning)? to", r"committed to", r"strive(s|ing)? for",
        r"intend(s|ing)? to", r"goal of", r"vision", r"hopefully", r"aspire(s|ing)? to",
        r"future", r"potential", r"believe",
    ]
    # One combined, case-insensitive pattern: a sentence counts if any
    # alternative appears anywhere in it.
    vague_re = re.compile("|".join(vague_patterns), re.IGNORECASE)
    hits = sum(1 for sentence in sentences if vague_re.search(sentence))
    # max(..., 1) guards against division by zero on empty input.
    return hits / max(len(sentences), 1)
def calculate_concrete_score(sentences):
    """
    Calculate the ratio of sentences containing specific, concrete metrics.

    A sentence counts if it mentions a percentage, a dollar amount, a weight,
    a "by 20XX" target year, or a past-tense concrete verb.
    Returns a float in [0, 1]; 0.0 for an empty input list.
    """
    concrete_patterns = [
        r"\d+(\.\d+)?%",                        # Percentages
        r"\$\d+",                               # Money
        r"\d+ (tons|kg|metric tons|tonnes)",    # Weight
        r"by 20\d{2}",                          # Years (e.g. by 2030)
        r"reduced by", r"achieved", r"completed"  # Past tense concrete verbs
    ]
    # Single case-insensitive alternation searched once per sentence.
    concrete_re = re.compile("|".join(concrete_patterns), re.IGNORECASE)
    hits = sum(1 for sentence in sentences if concrete_re.search(sentence))
    # max(..., 1) guards against division by zero on empty input.
    return hits / max(len(sentences), 1)
def analyze_sentiment(text_chunks):
    """
    Run FinBERT sentiment over each chunk and return the majority label.

    Chunks that raise during inference are skipped (error printed).
    Returns {"label": ..., "score": fraction of chunks with that label};
    falls back to {"label": "Neutral", "score": 0.5} when nothing scored.
    """
    predictions = []
    for chunk in text_chunks:
        # Truncate to 1500 chars (approx 300-400 tokens) to be safe;
        # the pipeline additionally truncates at 512 tokens.
        snippet = chunk[:1500] if len(chunk) > 1500 else chunk
        try:
            out = ml_models.finbert(snippet, truncation=True, max_length=512)
        except Exception as exc:
            print(f"Sentiment error: {exc}")
        else:
            predictions.append(out[0])  # e.g. {'label': 'Positive', 'score': 0.9}
    if not predictions:
        return {"label": "Neutral", "score": 0.5}
    # Tally the three known labels; any other label counts toward no bucket
    # but still contributes to the total.
    tally = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for pred in predictions:
        if pred['label'] in tally:
            tally[pred['label']] += 1
    total = len(predictions)
    pos, neg, neu = tally["Positive"], tally["Negative"], tally["Neutral"]
    # Strict majority required; ties fall through to Neutral.
    if pos > neg and pos > neu:
        return {"label": "Positive", "score": pos / total}
    if neg > pos and neg > neu:
        return {"label": "Negative", "score": neg / total}
    return {"label": "Neutral", "score": neu / total}
def analyze_aspect_sentiment(text_chunks, aspect_keywords):
    """
    Analyze sentiment only for chunks containing specific keywords.

    Matching is case-insensitive on the chunk side; aspect_keywords are
    assumed to already be lowercase. Returns the neutral fallback
    {"label": "Neutral", "score": 0.5} when no chunk mentions any keyword.
    """
    relevant = [
        chunk for chunk in text_chunks
        if any(keyword in chunk.lower() for keyword in aspect_keywords)
    ]
    if not relevant:
        return {"label": "Neutral", "score": 0.5}
    return analyze_sentiment(relevant)