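This script builds a synthetic question–context dataset for evaluating the retriever in a RAG pipeline: an LLM reads each indexed chunk and writes exam-style questions about it, so every generated question has a known source node to retrieve against.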
```python
import os
import json
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.llms.gemini import Gemini
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.query_engine import RetrieverQueryEngine
from utils import create_db
from config import CHROMA_PATH, CHROMA_COLLECTION

load_dotenv()

# GPT-4o-mini and text-embedding-3-small become the pipeline-wide defaults.
Settings.llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Build the project's vector database and keep the parsed nodes.
nodes = create_db(return_nodes=True)
```
The library's built-in `generate_question_context_pairs` fires one LLM request per node with no pause between calls, which quickly trips the rate limit on a free-tier Gemini API key. The generator is therefore reimplemented below with a configurable delay between requests.

```python
import re
import time
import uuid
import warnings
from typing import List

from tqdm import tqdm

from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
```
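The prompt template and loop below mirror the library's own implementation (template wording included), so the generated questions match what the built-in function would produce; the only functional change is the `request_delay` pause after each call.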
```python
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""


def generate_question_context_pairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
    request_delay: float = 2.0,
) -> EmbeddingQAFinetuneDataset:
    """Generate question/context pairs from nodes, pausing between requests."""
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(node_dict.items()):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        # One question per line; strip leading list numbering such as "1." or "2)".
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0][
            :num_questions_per_chunk
        ]

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

        # Pause before the next request to stay under the free-tier rate limit.
        time.sleep(request_delay)

    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )
```
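The returned `EmbeddingQAFinetuneDataset` is three parallel dictionaries: `queries` maps a generated UUID to each question, `corpus` maps node IDs to chunk text, and `relevant_docs` maps each question ID back to the node it was generated from. That last mapping is the ground truth a retriever evaluator scores against.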
With the rate-limited generator in place, a free-tier Gemini model writes one question for each of the first 25 nodes, pausing four seconds between calls:

```python
llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)

rag_eval_dataset = generate_question_context_pairs(
    nodes[:25],
    llm=llm,
    num_questions_per_chunk=1,
    request_delay=4,
)

# Save the dataset as a JSON file for later use.
rag_eval_dataset.save_json("./rag_eval_dataset.json")
```
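The `RetrieverEvaluator` imported at the top is the natural consumer of this file. Below is a minimal sketch of that follow-up step, assuming the same `nodes` are wrapped in a plain `VectorStoreIndex`; the index construction and the `similarity_top_k` value are illustrative, not part of the original script.

```python
import asyncio

from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import (
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator,
)

# Reload the dataset saved above.
rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset.json")

# Build a retriever over the same nodes the questions were generated from.
index = VectorStoreIndex(nodes)
retriever = index.as_retriever(similarity_top_k=2)

evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

# aevaluate_dataset is async; inside a notebook, `await` it directly instead.
eval_results = asyncio.run(evaluator.aevaluate_dataset(rag_eval_dataset))

# Average each metric over all generated questions.
for metric in ("mrr", "hit_rate"):
    score = sum(r.metric_vals_dict[metric] for r in eval_results) / len(eval_results)
    print(f"{metric}: {score:.3f}")
```

A hit rate near 1.0 means the source chunk almost always appears in the top-k results; MRR additionally rewards ranking it first.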