# AI_project/eval_generate_dataset.py
import re
import time
import uuid
import warnings
from typing import List

from dotenv import load_dotenv
from tqdm import tqdm

from llama_index.core import Settings
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI

from utils import create_db
# Load API keys (e.g. OPENAI_API_KEY, GOOGLE_API_KEY) from a local .env file.
load_dotenv()

# Register global defaults; any LlamaIndex component that is not given an
# explicit llm/embed_model will pick these up.
Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# create_db() is a project helper (see utils.py); with return_nodes=True it is
# assumed to return the chunked TextNodes used to build the Chroma index.
nodes = create_db(return_nodes=True)
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, \
generate only questions based on the query below.
You are a Teacher/Professor. Your task is to set up \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.
"""
def generate_question_context_pairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
    request_delay: float = 2.0,
) -> EmbeddingQAFinetuneDataset:
    """Generate question/context pairs from a set of nodes.

    Adapted from llama_index.core.evaluation.generate_question_context_pairs,
    with a configurable delay between requests to stay within LLM rate limits.
    """
node_dict = {
node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
for node in nodes
}
queries = {}
relevant_docs = {}
for node_id, text in tqdm(node_dict.items()):
query = qa_generate_prompt_tmpl.format(
context_str=text, num_questions_per_chunk=num_questions_per_chunk
)
response = llm.complete(query)
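        # The model typically answers with a numbered list; strip "1." / "2)"
        # style prefixes, drop blank lines, and keep at most the requested count.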
result = str(response).strip().split("\n")
questions = [
re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
]
questions = [question for question in questions if len(question) > 0][
:num_questions_per_chunk
]
num_questions_generated = len(questions)
if num_questions_generated < num_questions_per_chunk:
warnings.warn(
f"Fewer questions generated ({num_questions_generated}) "
f"than requested ({num_questions_per_chunk})."
)
for question in questions:
question_id = str(uuid.uuid4())
queries[question_id] = question
relevant_docs[question_id] = [node_id]
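        # Pause between chunks; the free-tier Gemini API enforces a strict
        # requests-per-minute limit.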
time.sleep(request_delay)
return EmbeddingQAFinetuneDataset(
queries=queries, corpus=node_dict, relevant_docs=relevant_docs
)
# Generate questions with Gemini on a free-tier API key, using the
# rate-limited generate_question_context_pairs defined above in place of
# the library version of the same name.
llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)
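# One question per chunk over the first 25 nodes, with a 4-second pause
# between requests to stay under the free tier's rate limit.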
rag_eval_dataset = generate_question_context_pairs(
nodes[:25],
llm=llm,
num_questions_per_chunk=1,
request_delay=4
)
# Save the dataset as a JSON file for later evaluation runs
rag_eval_dataset.save_json("./rag_eval_dataset.json")
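# The saved JSON can be reloaded in a later evaluation run. A minimal sketch,
# assuming an `index` built over the same nodes (not run here):
#
#   from llama_index.core.evaluation import (
#       EmbeddingQAFinetuneDataset,
#       RetrieverEvaluator,
#   )
#
#   rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset.json")
#   retriever = index.as_retriever(similarity_top_k=5)
#   evaluator = RetrieverEvaluator.from_metric_names(
#       ["mrr", "hit_rate"], retriever=retriever
#   )
#   # aevaluate_dataset is async, so call it via asyncio.run or from a notebook.
#   eval_results = await evaluator.aevaluate_dataset(rag_eval_dataset)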