Spaces:

Koshti10
/

TestLLMCalc

Sleeping

App Files Files Community

TestLLMCalc / src /collect_data.py

Koshti10

Upload 11 files

ef818ff verified about 1 year ago

raw

history blame contribute delete

5.71 kB

	"""
	Collect data from multiple sources and create a base dataframe for the LLMCalculator table
	Latency - https://github.com/clembench/clembench-runs/tree/main/Addenda/Latency
	Pricing - pricing.json
	Model info - https://github.com/kushal-10/clembench/blob/feat/registry/backends/model_registry_updated.json
	"""

	import io
	import json
	import os
	import posixpath

	import pandas as pd
	import requests

	from assets.text_content import CLEMBENCH_RUNS_REPO, REGISTRY_URL, BENCHMARK_FILE, LATENCY_FOLDER, RESULT_FILE, LATENCY_SUFFIX

	def validate_request(url: str, response) -> bool:
	    """
	    Report whether an HTTP response indicates success.

	    Args:
	        url (str): The URL that was requested; only used in the failure message
	        response (requests.Response): The response object to inspect

	    Returns:
	        bool: True when the status code is exactly 200. For any other status
	        code a failure message is printed and False is returned.
	    """
	    succeeded = response.status_code == 200
	    if not succeeded:
	        print(f"Failed to read file - {url}. Status Code: {response.status_code}")
	    return succeeded

	def fetch_benchmark_data(benchmark: str = "text", version_names: "list | None" = None) -> tuple:
	    """
	    Fetch and parse benchmark results and latency data from CSV files.

	    The versions are tried in the order given; the first version of the
	    requested benchmark type whose results and latency files both download
	    and parse successfully is returned.

	    Args:
	        benchmark (str): Type of benchmark to fetch ('text' or 'multimodal')
	        version_names (list | None): Version names to search through, expected
	            to be sorted latest first. None is treated as an empty list.

	    Returns:
	        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing:
	            - results_df: DataFrame with benchmark results
	            - latency_df: DataFrame with latency measurements
	        Returns (None, None) if no matching version is found or every
	        candidate version fails to download or parse.

	    Note:
	        requests.RequestException, pd.errors.EmptyDataError and
	        pd.errors.ParserError are caught per version and reported with
	        print(); the search then moves on to the next version.
	    """
	    # A version belongs to the multimodal benchmark iff its name says so.
	    want_multimodal = benchmark == "multimodal"

	    for v in version_names or []:
	        # Check if version matches benchmark type
	        is_multimodal = 'multimodal' in v
	        if want_multimodal != is_multimodal:
	            continue

	        # Construct URLs. posixpath.join always joins with "/", so the URLs
	        # do not depend on the host platform's path separator.
	        results_url = posixpath.join(CLEMBENCH_RUNS_REPO, v, RESULT_FILE)
	        latency_url = posixpath.join(CLEMBENCH_RUNS_REPO, LATENCY_FOLDER, v + LATENCY_SUFFIX)

	        try:
	            # A timeout keeps a stalled server from blocking the caller forever.
	            results = requests.get(results_url, timeout=30)
	            if not validate_request(results_url, results):
	                # No point downloading latency data for an unusable version.
	                continue

	            latency = requests.get(latency_url, timeout=30)
	            if not validate_request(latency_url, latency):
	                continue

	            # Convert the CSV content to pandas DataFrames
	            results_df = pd.read_csv(io.StringIO(results.text))
	            latency_df = pd.read_csv(io.StringIO(latency.text))
	            return results_df, latency_df

	        except requests.RequestException as e:
	            print(f"Error fetching data for version {v}: {e}")
	        except pd.errors.EmptyDataError:
	            print(f"Error: Empty CSV file found for version {v}")
	        except pd.errors.ParserError:
	            print(f"Error: Unable to parse CSV data for version {v}")

	    return None, None

	def fetch_version_metadata() -> tuple:
	    """
	    Fetch and process benchmark metadata from the Clembench GitHub repository.

	    The data is sourced from: https://github.com/clembench/clembench-runs
	    Configure the repository path in src/assets/text_content/CLEMBENCH_RUNS_REPO

	    Returns:
	        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple
	        containing, in this order:
	            - mm_latency: Multimodal latency data
	            - mm_result: Multimodal benchmark results
	            - text_latency: Text latency data
	            - text_result: Text benchmark results
	        Individual entries are None when fetch_benchmark_data finds no usable
	        version for that benchmark type.
	        Returns (None, None, None, None) if the version index request fails.
	    """
	    json_url = CLEMBENCH_RUNS_REPO + BENCHMARK_FILE

	    try:
	        # A timeout keeps a stalled server from blocking the caller forever.
	        response = requests.get(json_url, timeout=30)
	    except requests.RequestException as e:
	        # Honour the documented contract instead of letting a network error
	        # propagate to the caller.
	        print(f"Error fetching version metadata: {e}")
	        return None, None, None, None

	    # Check if the JSON file request was successful
	    if not validate_request(json_url, response):
	        return None, None, None, None

	    json_data = response.json()
	    versions = json_data['versions']

	    # Sort the versions in benchmark by latest first. The key drops the first
	    # character and any "_suffix", then compares the dot-separated integers,
	    # so e.g. "v1.10" sorts after "v1.9".
	    version_names = sorted(
	        [ver['version'] for ver in versions],
	        key=lambda v: list(map(int, v[1:].split('_')[0].split('.'))),
	        reverse=True
	    )

	    # Latency is in seconds
	    mm_result, mm_latency = fetch_benchmark_data("multimodal", version_names)
	    text_result, text_latency = fetch_benchmark_data("text", version_names)

	    # NOTE: latency comes before result here, while fetch_benchmark_data
	    # returns (result, latency). The order is kept for existing callers.
	    return mm_latency, mm_result, text_latency, text_result

	def fetch_registry_data() -> dict:
	    """
	    Fetch and parse model registry data from the Clembench registry URL.

	    The data is sourced from the model registry defined in REGISTRY_URL.
	    Contains information about various LLM models including their specifications
	    and capabilities.

	    Returns:
	        The parsed JSON payload of the registry.
	        Returns None if the request fails, the status code is not 200,
	        or the body is not valid JSON.

	    Note:
	        requests.RequestException and json.JSONDecodeError are caught and
	        reported with print(); they are not propagated to the caller.
	    """
	    try:
	        # A timeout keeps a stalled server from blocking the caller forever.
	        response = requests.get(REGISTRY_URL, timeout=30)
	        if not validate_request(REGISTRY_URL, response):
	            return None

	        return response.json()

	    # The JSON handler must come first: in recent requests releases the error
	    # raised by response.json() is also a RequestException, so the generic
	    # handler would otherwise report it as a fetch failure.
	    except json.JSONDecodeError as e:
	        print(f"Error parsing registry JSON: {e}")
	    except requests.RequestException as e:
	        print(f"Error fetching registry data: {e}")

	    return None

	if __name__ == "__main__":
	    # Manual smoke test: fetch the benchmark data, then show the first
	    # registry entry.
	    fetch_version_metadata()
	    registry_data = fetch_registry_data()
	    # fetch_registry_data returns None on failure; indexing None (or an empty
	    # registry) would raise, so guard before printing.
	    if registry_data:
	        print(registry_data[0])
	    else:
	        print("No registry data available.")