Spaces:
Sleeping
Sleeping
| import re | |
| from sentence_transformers import util | |
| from .ml_models import ml_models | |
# Reference phrases used as semantic anchors. Each list feeds
# semantic_matches(), which flags sentences whose sentence-transformer
# embedding is close (cosine similarity) to ANY phrase in the list.

# Environmental topics: climate, emissions, energy, conservation.
ENV_REF = [
    "environment", "climate change", "carbon emissions", "pollution", "waste",
    "green energy", "renewable resources", "sustainability", "biodiversity",
    "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation"
]
# Broader ESG coverage: a superset of ENV_REF plus social/governance themes.
ESG_REF = [
    "environment", "social responsibility", "governance", "sustainability", "carbon emissions",
    "green energy", "renewable resources", "waste management", "climate change", "pollution control",
    "biodiversity", "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation",
    "community development", "employee welfare", "diversity", "ethics"
]
# Concrete, past/present-tense actions already taken.
ACTION_REF = [
    "implemented", "adopted", "reduced emissions", "recycled", "renewable energy",
    "sustainability project", "steps taken to reduce carbon emissions",
    "initiatives to help the environment", "measures to prevent greenwashing"
]
# Forward-looking claims/pledges (potential greenwashing signals).
CLAIM_REF = [
    "plans to achieve", "committed to", "targets", "pledges", "goal", "aims to",
    "intent to reduce", "objective to be", "aims for sustainability",
    "pledged to achieve", "will reduce carbon", "expect to reach net zero",
    "plans to be carbon neutral by", "commitment to net zero by",
    "goal to be eco friendly by", "target year for sustainability",
    "striving to be net zero", "intends to adopt renewable energy", "aiming for eco-friendly operations"
]
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """
    Return the sentences that are semantically close to any reference phrase.

    Args:
        sentences: Sequence of sentence strings to screen.
        reference: List of anchor phrases to compare against.
        threshold: Minimum cosine similarity for a sentence to count as a match.
        batch_size: Number of sentences encoded per model call, to bound memory.

    Returns:
        List of matching sentences (whitespace-stripped), in original order;
        empty list when nothing matches.
    """
    model = ml_models.st_model
    # Encode the reference phrases once; reused against every batch below.
    ref_emb = model.encode(reference, convert_to_tensor=True)
    matches = []
    # Process sentences in batches. Note: range() slicing never produces an
    # empty batch, so no emptiness guard is needed.
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        sent_emb = model.encode(batch, convert_to_tensor=True)
        # sim_matrix rows = batch sentences, columns = reference phrases.
        sim_matrix = util.cos_sim(sent_emb, ref_emb)
        for row, sims in enumerate(sim_matrix):
            # A sentence matches if it is close to ANY reference phrase.
            if sims.max().item() >= threshold:
                matches.append(batch[row].strip())
    return matches
def calculate_scores(sentences):
    """
    Run the four semantic screens over *sentences* and summarise the results.

    Returns a dict with per-category match counts, plus the matched
    environmental and action sentences themselves (other categories only
    report counts).
    """
    env = semantic_matches(sentences, ENV_REF)
    esg = semantic_matches(sentences, ESG_REF)
    actions = semantic_matches(sentences, ACTION_REF)
    # Claims use a slightly looser threshold than the default 0.55.
    claims = semantic_matches(sentences, CLAIM_REF, threshold=0.54)
    return {
        "env_count": len(env),
        "esg_count": len(esg),
        "action_count": len(actions),
        "claim_count": len(claims),
        "env_sentences": env,
        "action_sentences": actions,
    }
def calculate_vague_score(sentences):
    """
    Calculate the ratio of sentences containing vague/future-tense language.

    Returns a float in [0, 1]; 0.0 for an empty input list.
    """
    vague_patterns = [
        r"aim(s|ing)? to", r"plan(s|ning)? to", r"committed to", r"strive(s|ing)? for",
        r"intend(s|ing)? to", r"goal of", r"vision", r"hopefully", r"aspire(s|ing)? to",
        r"future", r"potential", r"believe",
    ]
    # One combined, case-insensitive pattern: a sentence counts if any
    # alternative appears anywhere in it.
    vague_re = re.compile("|".join(vague_patterns), re.IGNORECASE)
    hits = sum(1 for sentence in sentences if vague_re.search(sentence))
    # max(..., 1) guards against division by zero on empty input.
    return hits / max(len(sentences), 1)
def calculate_concrete_score(sentences):
    """
    Calculate the ratio of sentences containing specific, concrete metrics.

    A sentence counts if it mentions a percentage, a dollar amount, a weight,
    a "by 20XX" target year, or a past-tense concrete verb.
    Returns a float in [0, 1]; 0.0 for an empty input list.
    """
    concrete_patterns = [
        r"\d+(\.\d+)?%",                        # Percentages
        r"\$\d+",                               # Money
        r"\d+ (tons|kg|metric tons|tonnes)",    # Weight
        r"by 20\d{2}",                          # Years (e.g. by 2030)
        r"reduced by", r"achieved", r"completed"  # Past tense concrete verbs
    ]
    # Single case-insensitive alternation searched once per sentence.
    concrete_re = re.compile("|".join(concrete_patterns), re.IGNORECASE)
    hits = sum(1 for sentence in sentences if concrete_re.search(sentence))
    # max(..., 1) guards against division by zero on empty input.
    return hits / max(len(sentences), 1)
def analyze_sentiment(text_chunks):
    """
    Run FinBERT sentiment over each chunk and return the majority label.

    Chunks that raise during inference are skipped (error printed).
    Returns {"label": ..., "score": fraction of chunks with that label};
    falls back to {"label": "Neutral", "score": 0.5} when nothing scored.
    """
    predictions = []
    for chunk in text_chunks:
        # Truncate to 1500 chars (approx 300-400 tokens) to be safe;
        # the pipeline additionally truncates at 512 tokens.
        snippet = chunk[:1500] if len(chunk) > 1500 else chunk
        try:
            out = ml_models.finbert(snippet, truncation=True, max_length=512)
        except Exception as exc:
            print(f"Sentiment error: {exc}")
        else:
            predictions.append(out[0])  # e.g. {'label': 'Positive', 'score': 0.9}
    if not predictions:
        return {"label": "Neutral", "score": 0.5}
    # Tally the three known labels; any other label counts toward no bucket
    # but still contributes to the total.
    tally = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for pred in predictions:
        if pred['label'] in tally:
            tally[pred['label']] += 1
    total = len(predictions)
    pos, neg, neu = tally["Positive"], tally["Negative"], tally["Neutral"]
    # Strict majority required; ties fall through to Neutral.
    if pos > neg and pos > neu:
        return {"label": "Positive", "score": pos / total}
    if neg > pos and neg > neu:
        return {"label": "Negative", "score": neg / total}
    return {"label": "Neutral", "score": neu / total}
def analyze_aspect_sentiment(text_chunks, aspect_keywords):
    """
    Analyze sentiment only for chunks containing specific keywords.

    Matching is case-insensitive on the chunk side; aspect_keywords are
    assumed to already be lowercase. Returns the neutral fallback
    {"label": "Neutral", "score": 0.5} when no chunk mentions any keyword.
    """
    relevant = [
        chunk for chunk in text_chunks
        if any(keyword in chunk.lower() for keyword in aspect_keywords)
    ]
    if not relevant:
        return {"label": "Neutral", "score": 0.5}
    return analyze_sentiment(relevant)