presentation-search / src /config /spreadsheets.py
Ilia Tambovtsev
feat: implement reffering to sheet by name
0c02234
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Union
import pandas as pd
from dotenv import load_dotenv
from gspread_pandas import Client, Spread
from gspread_pandas.conf import get_config
from src.config import Config
logger = logging.getLogger(__name__)
def load_spreadsheet(
sheet_id: Optional[str] = None,
gid: Optional[Union[str, int]] = None
) -> pd.DataFrame:
"""Load data from Google Spreadsheet.
Args:
sheet_id: Spreadsheet ID. If None, loads from BENCHMARK_SPREADSHEET_ID env var
gid: Sheet identifier. Can be either:
- Sheet ID (numeric)
- Sheet name (string)
If None, loads the first sheet
Returns:
DataFrame with loaded data
"""
if sheet_id is None:
load_dotenv()
sheet_id = os.environ.get("BENCHMARK_SPREADSHEET_ID")
if not sheet_id:
raise ValueError("No spreadsheet ID provided")
logger.info(f"Loading questions from spreadsheet ({sheet_id[:15]}...)/{gid}")
# Check if gid is numeric (sheet ID) or string (sheet name)
if gid is None or str(gid).isdigit():
# Use CSV export URL for numeric gid
csv_load_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv"
if gid is not None:
csv_load_url = f"{csv_load_url}&gid={gid}"
df = pd.read_csv(csv_load_url)
else:
# Load by sheet name using gspread_pandas
google_config_dir = Config().navigator.root
google_config = get_config(google_config_dir, "google_config.json")
client = Client(config=google_config)
spread = Spread(sheet_id, client=client)
df = spread.sheet_to_df(sheet=str(gid), index=False)
return df
class GoogleSpreadsheetManager:
def __init__(
self,
spread_id: str,
google_config_dir: Optional[Path] = Config().navigator.root,
google_config_fname: str = "google_config.json",
benchmark_spreadsheet_id: Optional[str] = None,
eval_spreadsheet_id: Optional[str] = None,
):
self.spread_id = spread_id
google_config = get_config(google_config_dir, google_config_fname)
self.google_client = Client(config=google_config)
# if benchmark_spreadsheet_id is None or eval_spreadsheet_id is None:
# load_dotenv()
# benchmark_spreadsheet_id = os.getenv("BENCHMARK_SPREADSHEET_ID")
# eval_spreadsheet_id = os.getenv("EVAL_SPREADSHEET_ID")
#
# self.benchmark_spreadsheet_id = benchmark_spreadsheet_id
# self.eval_spreadsheet_id = self.eval_spreadsheet_id
def write_spreadsheet(
self, df: pd.DataFrame, sheet_id: str, sheet_name: str, start: str
) -> None:
spread = Spread(sheet_id, config=self.google_client)
spread.df_to_sheet(df, index=False, sheet=sheet_name, start=start, replace=True)
def get_spread(self) -> Spread:
return Spread(
self.spread_id,
client=self.google_client,
create_spread=True,
create_sheet=True,
)
class GoogleSpreadsheetManagerMLFlow:
"""Manages Google Spreadsheet operations for evaluation results"""
def __init__(
self,
spread_id: str,
google_config_dir: Optional[Path] = Config().navigator.root,
google_config_fname: str = "google_config.json",
):
"""Initialize spreadsheet manager
Args:
spread_id: ID of the target spreadsheet
google_config_dir: Directory containing Google credentials
google_config_fname: Name of the Google credentials file
"""
self.spread_id = spread_id
self._config = get_config(google_config_dir, google_config_fname)
self._client = Client(config=self._config)
self._spread = self.get_spread()
# Define standard sheet names and layouts
self._summary_sheet = "Evaluation Summary"
self._details_sheet = "Detailed Results"
self._metrics_sheet = "Metrics History"
# Initialize standard sheets if they don't exist
self._init_sheets()
def _init_sheets(self) -> None:
"""Initialize standard sheets with headers if they don't exist"""
# Check and create summary sheet
if self._summary_sheet not in self._spread.sheets:
summary_headers = [
"Timestamp",
"Experiment",
"Retriever",
"Scorer",
"Questions Count",
"Mean Metrics",
]
self._create_sheet(self._summary_sheet, summary_headers)
# Check and create details sheet
if self._details_sheet not in self._spread.sheets:
details_headers = [
"Timestamp",
"Experiment",
"Retriever",
"Scorer",
"Question",
"Expected Presentation",
"Expected Pages",
"Retrieved Presentations",
"Retrieved Pages",
"Metric Scores",
"Metric Explanations",
]
self._create_sheet(self._details_sheet, details_headers)
# Check and create metrics history sheet
if self._metrics_sheet not in self._spread.sheets:
metrics_headers = [
"Timestamp",
"Experiment",
"Retriever",
"Scorer",
"Metric Name",
"Mean Score",
]
self._create_sheet(self._metrics_sheet, metrics_headers)
def _create_sheet(self, sheet_name: str, headers: List[str]) -> None:
"""Create new sheet with headers
Args:
sheet_name: Name for the new sheet
headers: List of column headers
"""
try:
spread = self._spread.find_sheet(sheet_name)
if spread:
logger.info(f"Using existing sheet '{sheet_name}'")
else:
self._spread.create_sheet(sheet_name)
worksheet = self._spread.find_sheet(sheet_name)
if worksheet:
worksheet.update([headers])
logger.info(f"Created sheet '{sheet_name}' with headers")
except Exception as e:
logger.error(f"Failed to create sheet '{sheet_name}': {str(e)}")
raise
def get_spread(self) -> Spread:
"""Get Spread instance for the target spreadsheet"""
return Spread(
self.spread_id,
client=self._client,
create_spread=True,
create_sheet=True,
)
def write_evaluation_results(
self,
results_df: pd.DataFrame,
metric_values: Dict[str, List[float]],
experiment_name: str,
) -> None:
"""Write evaluation results to spreadsheet
Args:
results_df: DataFrame with detailed evaluation results
metric_values: Dictionary mapping metric names to score lists
experiment_name: Name of the experiment
"""
try:
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
retriever = results_df["retriever"].iloc[0]
scorer = results_df["scorer"].iloc[0]
# Write summary
summary_row = {
"Timestamp": timestamp,
"Experiment": experiment_name,
"Retriever": retriever,
"Scorer": scorer,
"Questions Count": len(results_df),
"Mean Metrics": ", ".join(
f"{name}: {sum(values)/len(values):.3f}"
for name, values in metric_values.items()
),
}
self._append_rows(self._summary_sheet, [summary_row])
# Write detailed results
details = []
for _, row in results_df.iterrows():
detail_row = {
"Timestamp": timestamp,
"Experiment": experiment_name,
"Retriever": retriever,
"Scorer": scorer,
"Question": row["question"],
"Expected Presentation": row["expected_presentation"],
"Expected Pages": row["expected_pages"],
"Retrieved Presentations": row["retrieved_presentations"],
"Retrieved Pages": row["retrieved_pages"],
"Metric Scores": ", ".join(
f"{col.replace('metric_', '').replace('_score', '')}: {row[col]}"
for col in results_df.columns
if col.endswith("_score")
),
"Metric Explanations": "\n".join(
f"{col.replace('metric_', '').replace('_explanation', '')}: {row[col]}"
for col in results_df.columns
if col.endswith("_explanation")
),
}
details.append(detail_row)
self._append_rows(self._details_sheet, details)
# Write metrics history
metrics = []
for metric_name, values in metric_values.items():
metrics.append(
{
"Timestamp": timestamp,
"Experiment": experiment_name,
"Retriever": retriever,
"Scorer": scorer,
"Metric Name": metric_name,
"Mean Score": sum(values) / len(values),
}
)
self._append_rows(self._metrics_sheet, metrics)
logger.info(
f"Successfully wrote evaluation results to sheets: "
f"{self._summary_sheet}, {self._details_sheet}, {self._metrics_sheet}"
)
except Exception as e:
logger.error(f"Failed to write evaluation results: {str(e)}")
raise
def _append_rows(self, sheet_name: str, rows: List[Dict]) -> None:
"""Append rows to specified sheet
Args:
sheet_name: Target sheet name
rows: List of dictionaries representing rows
"""
try:
df = pd.DataFrame(rows)
worksheet = self._spread.find_sheet(sheet_name)
if worksheet:
start = f"A{len(worksheet.get_all_values()) + 1}"
self.write_spreadsheet(df, self.spread_id, sheet_name, start)
except Exception as e:
logger.error(f"Failed to append rows to '{sheet_name}': {str(e)}")
raise
def write_spreadsheet(
self, df: pd.DataFrame, sheet_id: str, sheet_name: str, start: str
) -> None:
"""Write DataFrame to spreadsheet at specified location
Args:
df: DataFrame to write
sheet_id: Target spreadsheet ID
sheet_name: Target sheet name
start: Starting cell (e.g. 'A1')
"""
spread = Spread(sheet_id, client=self._client)
spread.df_to_sheet(
df, index=False, headers=False, sheet=sheet_name, start=start, replace=False
)