champ-chatbot / patch /ecologits_patch.py
qyle's picture
Deploy from GitLab 076dca9f
f0ef107
# patch_ecologits.py
# Ecologits contains a bug where the number of input tokens is swapped with the number
# of output tokens. This:
# input_tokens = chunk.usage_metadata.candidates_token_count
# output_tokens = chunk.usage_metadata.total_token_count - input_tokens
# must be replaced by:
# output_tokens = chunk.usage_metadata.candidates_token_count
# input_tokens = chunk.usage_metadata.total_token_count - output_tokens
# The bug is repeated in multiple parts of the Ecologits codebase.
# We are fixing it here only for the Gemini code.
import time
from typing import Any, Callable
from ecologits.tracers import google_genai_tracer
from google.genai.models import Models
# Keep a reference to the upstream `_generator` before it is replaced at the
# bottom of this module. The reference is not used anywhere else in the
# visible part of this file.
_original_generator = google_genai_tracer._generator
async def _patched_generator(stream, timer_start, model_name):
    """Re-yield streamed Gemini chunks, attaching impacts to the finishing chunk.

    Replacement for ``google_genai_tracer._generator`` in which the output
    token count is read from ``candidates_token_count`` and the input token
    count is derived as ``total_token_count - output_tokens``.

    Args:
        stream: Async iterable of response chunks.
        timer_start: ``time.perf_counter()`` value taken when the request was
            issued; used to compute the request latency.
        model_name: Model identifier forwarded to the impact computation and
            to the telemetry record.

    Yields:
        A ``google_genai_tracer.GenerateContentResponse`` built from each
        chunk. ``impacts`` is ``None`` for every chunk except a finishing
        chunk for which ``llm_impacts`` returned a value.
    """
    tracer = google_genai_tracer
    async for chunk in stream:
        # A chunk may carry no candidates at all; indexing it blindly would
        # raise, so such chunks are passed through as intermediate ones.
        candidates = chunk.candidates
        if not candidates or candidates[0].finish_reason is None:
            yield tracer.GenerateContentResponse(
                **chunk.model_dump(), impacts=None
            )
            continue

        request_latency = time.perf_counter() - timer_start

        # Usage metadata and its individual counters are optional; missing
        # values are counted as zero instead of raising on the subtraction.
        usage = chunk.usage_metadata
        output_tokens = (usage.candidates_token_count or 0) if usage else 0
        total_tokens = (usage.total_token_count or 0) if usage else 0
        # NOTE(review): the total may include more than prompt + candidates
        # tokens for some models - confirm this derivation is still wanted.
        input_tokens = max(total_tokens - output_tokens, 0)

        impacts = tracer.llm_impacts(
            provider=tracer.PROVIDER,
            model_name=model_name,
            output_token_count=output_tokens,
            request_latency=request_latency,
            electricity_mix_zone=tracer.EcoLogits.config.electricity_mix_zone,
        )
        if impacts is not None:
            telemetry = tracer.EcoLogits.config.opentelemetry
            if telemetry:
                telemetry.record_request(
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    request_latency=request_latency,
                    impacts=impacts,
                    provider=tracer.PROVIDER,
                    model=model_name,
                    endpoint=f"/v1beta/models/{model_name}:generateContent",
                )
        # `impacts` is None here when it could not be computed, which matches
        # the explicit `impacts=None` branch of the previous implementation.
        yield tracer.GenerateContentResponse(
            **chunk.model_dump(), impacts=impacts
        )
def _patched_google_genai_content_wrapper(
    wrapped: Callable,
    instance: Models,  # noqa: ARG001
    args: Any,
    kwargs: Any,
):
    """Wrap a Google GenAI (non-streaming) answer with computed impacts.

    Replacement for ``google_genai_tracer.google_genai_content_wrapper`` in
    which the output token count is read from ``candidates_token_count`` and
    the input token count is derived as ``total_token_count - output_tokens``.

    Args:
        wrapped: Callable that returns the LLM response.
        instance: Never used - for compatibility with `wrapt`.
        args: Positional arguments of the callable.
        kwargs: Keyword arguments of the callable; must contain ``"model"``.

    Returns:
        A ``google_genai_tracer.GenerateContentResponse`` carrying the
        impacts, or the untouched response when no impacts were computed.

    Raises:
        KeyError: If ``kwargs`` has no ``"model"`` entry.
    """
    timer_start = time.perf_counter()
    response = wrapped(*args, **kwargs)
    request_latency = time.perf_counter() - timer_start
    model_name = kwargs["model"]

    # Usage metadata and its individual counters are optional; missing values
    # are counted as zero instead of raising on the subtraction.
    usage = response.usage_metadata
    output_tokens = (usage.candidates_token_count or 0) if usage else 0
    total_tokens = (usage.total_token_count or 0) if usage else 0
    # NOTE(review): the total may include more than prompt + candidates tokens
    # for some models - confirm this derivation is still wanted.
    input_tokens = max(total_tokens - output_tokens, 0)

    tracer = google_genai_tracer
    impacts = tracer.llm_impacts(
        provider=tracer.PROVIDER,
        model_name=model_name,
        output_token_count=output_tokens,
        request_latency=request_latency,
        electricity_mix_zone=tracer.EcoLogits.config.electricity_mix_zone,
    )
    if impacts is None:
        # Nothing to attach: hand back the response exactly as received.
        return response

    telemetry = tracer.EcoLogits.config.opentelemetry
    if telemetry:
        telemetry.record_request(
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            request_latency=request_latency,
            impacts=impacts,
            provider=tracer.PROVIDER,
            model=model_name,
            endpoint=f"/v1beta/models/{model_name}:generateContent",
        )
    return tracer.GenerateContentResponse(
        **response.model_dump(), impacts=impacts
    )
# Apply the patch: rebind the tracer module's attributes to the fixed
# implementations defined in this file. This runs as an import-time side effect.
# NOTE(review): presumably this module must be imported before EcoLogits
# instruments the Google GenAI client, otherwise references captured earlier
# may still point at the upstream functions - confirm against the caller.
google_genai_tracer._generator = _patched_generator
google_genai_tracer.google_genai_content_wrapper = _patched_google_genai_content_wrapper