# Deploy GreenIntellect Backend API with ML models and scraping (commit 02cc7f6)
import re
from sentence_transformers import util
from .ml_models import ml_models
# Reference phrases: each list defines one topic category for semantic
# matching in semantic_matches() — a sentence counts as a hit when its
# embedding is sufficiently similar to ANY phrase in the list.

# Environment-related vocabulary (climate, energy, conservation).
ENV_REF = [
"environment", "climate change", "carbon emissions", "pollution", "waste",
"green energy", "renewable resources", "sustainability", "biodiversity",
"eco-friendly", "net zero", "solar energy", "wind energy", "water conservation"
]
# Broader ESG vocabulary: environment terms plus social and governance topics.
ESG_REF = [
"environment", "social responsibility", "governance", "sustainability", "carbon emissions",
"green energy", "renewable resources", "waste management", "climate change", "pollution control",
"biodiversity", "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation",
"community development", "employee welfare", "diversity", "ethics"
]
# Phrases indicating concrete actions already taken (past/completed work).
ACTION_REF = [
"implemented", "adopted", "reduced emissions", "recycled", "renewable energy",
"sustainability project", "steps taken to reduce carbon emissions",
"initiatives to help the environment", "measures to prevent greenwashing"
]
# Phrases indicating forward-looking claims and pledges (not yet done) —
# matched with a slightly looser threshold in calculate_scores().
CLAIM_REF = [
"plans to achieve", "committed to", "targets", "pledges", "goal", "aims to",
"intent to reduce", "objective to be", "aims for sustainability",
"pledged to achieve", "will reduce carbon", "expect to reach net zero",
"plans to be carbon neutral by", "commitment to net zero by",
"goal to be eco friendly by", "target year for sustainability",
"striving to be net zero", "intends to adopt renewable energy", "aiming for eco-friendly operations"
]
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """
    Return the sentences that are semantically close to any reference phrase.

    Args:
        sentences: Sequence of sentence strings to screen.
        reference: List of reference phrases defining the topic category.
        threshold: Minimum cosine similarity (against the best-matching
            reference phrase) for a sentence to count as a match.
        batch_size: Number of sentences encoded per model call, bounding
            memory use on long documents.

    Returns:
        List of matching sentences, each stripped of surrounding
        whitespace; empty list when nothing matches.
    """
    model = ml_models.st_model
    # Encode the reference phrases once; reused against every batch.
    ref_emb = model.encode(reference, convert_to_tensor=True)
    matches = []
    # Process in batches to keep peak memory bounded.
    for i in range(0, len(sentences), batch_size):
        # A slice starting at i < len(sentences) is never empty, so no
        # empty-batch guard is needed.
        batch = sentences[i:i + batch_size]
        sent_emb = model.encode(batch, convert_to_tensor=True)
        # sim_matrix rows = batch sentences, columns = reference phrases.
        sim_matrix = util.cos_sim(sent_emb, ref_emb)
        for j, sim_scores in enumerate(sim_matrix):
            # Match if the best reference similarity clears the threshold.
            if sim_scores.max().item() >= threshold:
                matches.append(batch[j].strip())
    # `matches` is already a list; no need for a falsy fallback.
    return matches
def calculate_scores(sentences):
    """
    Screen *sentences* against every reference-phrase category and report
    per-category match counts, plus the matched environment/action text.
    """
    env_hits = semantic_matches(sentences, ENV_REF)
    esg_hits = semantic_matches(sentences, ESG_REF)
    action_hits = semantic_matches(sentences, ACTION_REF)
    # Forward-looking claims use a slightly looser similarity bar.
    claim_hits = semantic_matches(sentences, CLAIM_REF, threshold=0.54)
    scores = {
        "env_count": len(env_hits),
        "esg_count": len(esg_hits),
        "action_count": len(action_hits),
        "claim_count": len(claim_hits),
        "env_sentences": env_hits,
        "action_sentences": action_hits,
    }
    return scores
def calculate_vague_score(sentences):
    """
    Calculate the ratio of sentences containing vague/future-tense language.
    """
    # Patterns flagging aspirational/non-committal wording.
    vague_terms = (
        r"aim(s|ing)? to", r"plan(s|ning)? to", r"committed to", r"strive(s|ing)? for",
        r"intend(s|ing)? to", r"goal of", r"vision", r"hopefully", r"aspire(s|ing)? to",
        r"future", r"potential", r"believe",
    )
    vague_re = re.compile("|".join(vague_terms), re.IGNORECASE)
    hits = sum(1 for sentence in sentences if vague_re.search(sentence))
    # max(..., 1) guards against division by zero on empty input.
    return hits / max(len(sentences), 1)
def calculate_concrete_score(sentences):
    """
    Calculate the ratio of sentences containing specific, concrete metrics.
    Looking for numbers followed by %, $, tons, kg, or years.
    """
    concrete_terms = (
        r"\d+(\.\d+)?%",                        # percentages
        r"\$\d+",                               # dollar amounts
        r"\d+ (tons|kg|metric tons|tonnes)",    # weights
        r"by 20\d{2}",                          # target years, e.g. "by 2030"
        r"reduced by", r"achieved", r"completed",  # past-tense concrete verbs
    )
    concrete_re = re.compile("|".join(concrete_terms), re.IGNORECASE)
    hits = sum(1 for sentence in sentences if concrete_re.search(sentence))
    # max(..., 1) guards against division by zero on empty input.
    return hits / max(len(sentences), 1)
def analyze_sentiment(text_chunks):
    """
    Run FinBERT sentiment over each chunk and aggregate by majority label.

    Returns a dict {"label": ..., "score": ...} where score is the share
    of chunks carrying the winning label; defaults to Neutral/0.5 when no
    chunk could be scored.
    """
    labelled = []
    for chunk in text_chunks:
        # Truncate to 1500 chars (roughly 300-400 tokens) to stay safe.
        text = chunk[:1500] if len(chunk) > 1500 else chunk
        try:
            outputs = ml_models.finbert(text, truncation=True, max_length=512)
            labelled.append(outputs[0])  # e.g. {'label': 'Positive', 'score': 0.9}
        except Exception as e:
            # Best-effort: skip chunks that fail, keep scoring the rest.
            print(f"Sentiment error: {e}")
    if not labelled:
        return {"label": "Neutral", "score": 0.5}
    tally = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for result in labelled:
        if result['label'] in tally:
            tally[result['label']] += 1
    total = len(labelled)
    pos, neg, neu = tally["Positive"], tally["Negative"], tally["Neutral"]
    # Strict majority wins; any tie falls through to Neutral.
    if pos > neg and pos > neu:
        return {"label": "Positive", "score": pos / total}
    if neg > pos and neg > neu:
        return {"label": "Negative", "score": neg / total}
    return {"label": "Neutral", "score": neu / total}
def analyze_aspect_sentiment(text_chunks, aspect_keywords):
    """
    Analyze sentiment only for chunks containing specific keywords
    """
    # Keep only chunks mentioning at least one aspect keyword
    # (case-insensitive on the chunk side).
    relevant = [
        chunk for chunk in text_chunks
        if any(keyword in chunk.lower() for keyword in aspect_keywords)
    ]
    if relevant:
        return analyze_sentiment(relevant)
    # No chunk mentions the aspect — report a neutral default.
    return {"label": "Neutral", "score": 0.5}