import math
import os
import re
import time
from datetime import datetime
from typing import List, Tuple

import nltk
import numpy as np
import pandas as pd
import requests
from huggingface_hub import HfApi, hf_hub_download
from nltk.tokenize import sent_tokenize
from sklearn.metrics import mean_squared_error, roc_auc_score

from constants import HF_DATASET_REPO_NAME, HF_REPO_TYPE

# Ensure punkt resources are available
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)

def form_document_sentences_from_chunks(retrieved_chunks):
    """
    Convert a list of retrieved chunks into document_sentences format.

    Args:
        retrieved_chunks: List of Chunk/Document objects with a `.text`
            attribute, or plain strings.

    Returns:
        List[List[str]]: list of [key, sentence] pairs, where keys look
        like "0a", "0b", "1a" (document index + sentence letter).
    """
    document_sentences = []
    for doc_idx, chunk in enumerate(retrieved_chunks):
        # Accept both chunk objects and raw strings.
        text = chunk.text if hasattr(chunk, "text") else chunk
        sentences = sent_tokenize(text)
        for sent_idx, sent in enumerate(sentences):
            key = f"{doc_idx}{chr(ord('a') + sent_idx)}"
            document_sentences.append([key, sent.strip()])
    return document_sentences
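
# Illustrative example (hypothetical data). Plain strings also work, since
# chunks without a `.text` attribute are used as-is:
#   form_document_sentences_from_chunks(["First sentence. Second one.", "Third."])
#   -> [["0a", "First sentence."], ["0b", "Second one."], ["1a", "Third."]]
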
def form_response_sentences(response_text):
    """Split a response into [key, sentence] pairs keyed "a.", "b.", ..."""
    sentences = sent_tokenize(response_text)
    return [[f"{chr(ord('a') + idx)}.", sent.strip()] for idx, sent in enumerate(sentences)]
def extract_response_text(full_text):
    # Look for "Answer: [/INST]" and capture everything after it
    match = re.search(r"Answer:\s*\[/INST\]\s*(.*)", full_text, re.DOTALL)
    if match:
        return match.group(1).strip()
    else:
        return None  # or raise an error if preferred
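
# Illustrative example (hypothetical generation output): for a Mistral-style
# transcript like "[INST] ... Answer: [/INST] Paris is the capital.",
# extract_response_text returns "Paris is the capital."; it returns None when
# the "Answer: [/INST]" marker is absent.
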
def post_with_retry(url, headers, payload, retries=3):
    """POST `payload` as JSON, retrying non-200 responses with linear backoff."""
    response = None
    for attempt in range(retries):
        response = requests.post(url, headers=headers, json=payload)
        if response.status_code == 200:
            return response
        retry_after = 30 * (attempt + 1)  # linear backoff: 30s, 60s, 90s, ...
        try:
            error_msg = response.json().get("error", {}).get("message", "")
        except ValueError:
            # Response body was not JSON; fall back to the raw text.
            error_msg = response.text[:200]
        print(f"[{attempt + 1}] Request failed with status {response.status_code}. "
              f"Message: {error_msg}, sleeping for {retry_after} seconds...")
        time.sleep(retry_after)
    raise RuntimeError(f"Failed after {retries} retries: {response.text if response is not None else 'no response'}")
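
# Sketch of intended usage (hypothetical endpoint and payload; any
# OpenAI-style chat-completions API of this shape would work the same way):
#   response = post_with_retry(
#       url="https://api.example.com/v1/chat/completions",
#       headers={"Authorization": f"Bearer {os.getenv('API_KEY')}"},
#       payload={"model": "some-model", "messages": [{"role": "user", "content": prompt}]},
#   )
#   result = response.json()
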
def get_evaluator_trace_prompt(documents, question, answer):
    return f"""
I asked someone to answer a question based on one or more documents.
Your task is to review their response and assess whether or not each sentence
in that response is supported by text in the documents. And if so, which
sentences in the documents provide that support. You will also tell me which
of the documents contain useful information for answering the question, and
which of the documents the answer was sourced from.
Here are the documents, each of which is split into sentences. Alongside each
sentence is an associated key, such as '0a.' or '0b.' that you can use to refer
to it:
```
{documents}
```
The question was:
```
{question}
```
Here is their response, split into sentences. Alongside each sentence is an
associated key, such as 'a.' or 'b.' that you can use to refer to it. Note
that these keys are unique to the response, and are not related to the keys
in the documents:
```
{answer}
```
You must respond with a JSON object matching this schema:
```
{{
  "relevance_explanation": string,
  "all_relevant_sentence_keys": [string],
  "overall_supported_explanation": string,
  "overall_supported": boolean,
  "sentence_support_information": [
    {{
      "response_sentence_key": string,
      "explanation": string,
| "supporting_sentence_keys": [string], | |
| "fully_supported": boolean | |
| }}, | |
| ], | |
| "all_utilized_sentence_keys": [string] | |
| }} | |
| βββ | |
| The relevance_explanation field is a string explaining which documents | |
| contain useful information for answering the question. Provide a step-by-step | |
| breakdown of information provided in the documents and how it is useful for | |
| answering the question. | |
| The all_relevant_sentence_keys field is a list of all document sentences keys | |
| (e.g. β0aβ) that are revant to the question. Include every sentence that is | |
| useful and relevant to the question, even if it was not used in the response, | |
| or if only parts of the sentence are useful. Ignore the provided response when | |
| making this judgement and base your judgement solely on the provided documents | |
| and question. Omit sentences that, if removed from the document, would not | |
| impact someoneβs ability to answer the question. | |
| The overall_supported_explanation field is a string explaining why the response | |
| *as a whole* is or is not supported by the documents. In this field, provide a | |
| step-by-step breakdown of the claims made in the response and the support (or | |
| lack thereof) for those claims in the documents. Begin by assessing each claim | |
| separately, one by one; donβt make any remarks about the response as a whole | |
| until you have assessed all the claims in isolation. | |
| The overall_supported field is a boolean indicating whether the response as a | |
| whole is supported by the documents. This value should reflect the conclusion | |
| you drew at the end of your step-by-step breakdown in overall_supported_explanation. | |
| In the sentence_support_information field, provide information about the support | |
| *for each sentence* in the response. | |
| The sentence_support_information field is a list of objects, one for each sentence | |
| in the response. Each object MUST have the following fields: | |
| - response_sentence_key: a string identifying the sentence in the response. | |
| This key is the same as the one used in the response above. | |
| - explanation: a string explaining why the sentence is or is not supported by the | |
| documents. | |
| - supporting_sentence_keys: keys (e.g. β0aβ) of sentences from the documents that | |
| support the response sentence. If the sentence is not supported, this list MUST | |
| be empty. If the sentence is supported, this list MUST contain one or more keys. | |
| In special cases where the sentence is supported, but not by any specific sentence, | |
| you can use the string "supported_without_sentence" to indicate that the sentence | |
| is generally supported by the documents. Consider cases where the sentence is | |
| expressing inability to answer the question due to lack of relevant information in | |
| the provided contex as "supported_without_sentence". In cases where the sentence | |
| is making a general statement (e.g. outlining the steps to produce an answer, or | |
| summarizing previously stated sentences, or a transition sentence), use the | |
| sting "general".In cases where the sentence is correctly stating a well-known fact, | |
| like a mathematical formula, use the string "well_known_fact". In cases where the | |
| sentence is performing numerical reasoning (e.g. addition, multiplication), use | |
| the string "numerical_reasoning". | |
| - fully_supported: a boolean indicating whether the sentence is fully supported by | |
| the documents. | |
| - This value should reflect the conclusion you drew at the end of your step-by-step | |
| breakdown in explanation. | |
| - If supporting_sentence_keys is an empty list, then fully_supported must be false. | |
  - Otherwise, use fully_supported to clarify whether everything in the response
    sentence is fully supported by the document text indicated in supporting_sentence_keys
    (fully_supported = true), or whether the sentence is only partially or incompletely
    supported by that document text (fully_supported = false).
The all_utilized_sentence_keys field is a list of all sentence keys (e.g. '0a') that
were used to construct the answer. Include every sentence that either directly supported
the answer, or was implicitly used to construct the answer, even if it was not used
in its entirety. Omit sentences that were not used, and could have been removed from
the documents without affecting the answer.
STRICT INSTRUCTION: Your output must be strictly valid JSON matching the above schema.
Do not include any extra text or comments outside the JSON.
You must respond with a valid JSON string. Use escapes for quotes, e.g. '\\"', and
newlines, e.g. '\\n'. Do not write anything before or after the JSON string. Do not
wrap the JSON string in backticks like ``` or ```json.
As a reminder: your task is to review the response and assess which documents contain
useful information pertaining to the question, and how each sentence in the response
is supported by the text in the documents.\
"""
def convert_to_serializable(obj):
    """Convert DataFrame and other objects to JSON-serializable formats"""
    if isinstance(obj, pd.DataFrame):
        return obj.to_dict(orient='records')
    elif hasattr(obj, 'to_dict'):  # Handle other pandas objects
        return obj.to_dict()
    elif isinstance(obj, (datetime, pd.Timestamp)):
        return obj.isoformat()
    return str(obj)  # Fallback for other types
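
# Intended usage: pass as the `default` hook so json.dumps can handle
# DataFrames, Timestamps, and other non-serializable values (illustrative):
#   import json
#   json.dumps({"ts": pd.Timestamp("2024-01-01"), "df": df}, default=convert_to_serializable)
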
def filter_missing(y_true: List[float], y_pred: List[float]) -> Tuple[List[float], List[float]]:
    """Filter out (true, pred) pairs where the prediction is None or NaN, and report how many were skipped."""
    clean_true, clean_pred = [], []
    skipped = 0
    for yt, yp in zip(y_true, y_pred):
        if yp is None:
            skipped += 1
            continue
        try:
            if math.isnan(yp):
                skipped += 1
                continue
        except TypeError:
            # Non-numeric prediction (e.g. a string) cannot be scored either.
            skipped += 1
            continue
        clean_true.append(yt)
        clean_pred.append(yp)
    if skipped > 0:
        print(f"Filtered {skipped} invalid predictions (None or NaN)")
    return clean_true, clean_pred

def compute_rmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return round(rmse, 2)

def compute_metric_with_missing(y_true, y_pred, metric='RMSE'):
    y_true_filtered, y_pred_filtered = filter_missing(y_true, y_pred)
    if not y_true_filtered:
        raise ValueError("All predictions are missing. Cannot compute metric.")
    if metric == 'RMSE':
        return compute_rmse(y_true_filtered, y_pred_filtered)
    elif metric == 'AUC':
        unique_labels = set(y_true_filtered)
        if len(unique_labels) < 2:
            print("Skipping AUC: only one class present.")
            return None  # or float("nan")
        return round(roc_auc_score(y_true_filtered, y_pred_filtered), 3)
    else:
        raise ValueError("Unsupported metric: use 'RMSE' or 'AUC'")
def extract_metric_lists(data, metric_key='relevance_score'):
    """Pull parallel lists of true and predicted values for `metric_key` out of a list of result dicts."""
    y_true_list = []
    y_pred_list = []
    for dp in data:
        true_value = dp.get('y_metrics', {}).get(metric_key)
        pred_value = dp.get('y_pred_metrics', {}).get(metric_key)
        y_true_list.append(true_value)
        y_pred_list.append(pred_value)
    return y_true_list, y_pred_list
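
# Illustrative example of the expected record shape (hypothetical data):
#   data = [{"y_metrics": {"relevance_score": 1.0},
#            "y_pred_metrics": {"relevance_score": 0.8}}]
#   extract_metric_lists(data, metric_key="relevance_score")  # -> ([1.0], [0.8])
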
def upload_file(filename: str, folder_path: str) -> None:
    """Upload a local file to the Hugging Face Hub, placing it under `folder_path` in the dataset repo."""
    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=f"{folder_path}/{filename}",
            repo_id=HF_DATASET_REPO_NAME,
            repo_type=HF_REPO_TYPE,
            token=os.getenv("HF_TOKEN")
        )
        print(f"Uploaded {filename} to {HF_DATASET_REPO_NAME}")
    except Exception as e:
        print(f"Error uploading {filename}: {e}")