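"""Indexing, prediction, and evaluation entry points for the RAG pipeline.

Builds dense (Chroma) and sparse (SPLADE) indexes over the document corpus,
runs the QA pipeline over an evaluation dataset, and scores the results with
ragas.
"""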
import os
from typing import Tuple

import click
import pandas as pd
from datasets import Dataset
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_google_vertexai import ChatVertexAI
from loguru import logger
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    context_precision,
)
from tqdm import tqdm

from app.chroma import ChromaDenseVectorDB
from app.config.load import load_config
from app.config.models.configs import Config
from app.config.models.vertexai import VertexAIModel
from app.parsers.splitter import DocumentSplitter
from app.pipeline import LLMBundle
from app.ranking import BCEReranker
from app.splade import SpladeSparseVectorDB


def get_hash_mapping_filenames(
    config: Config,
    file_to_hash_fn: str = "file_hash_mappings.snappy.parquet",
    docid_to_hash_fn: str = "docid_hash_mappings.snappy.parquet",
) -> Tuple[str, str]:
    """Return the parquet paths used to persist file-hash and doc-id-hash mappings."""
    file_hashes_fn = os.path.join(config.embeddings.embeddings_path, file_to_hash_fn)
    docid_hashes_fn = os.path.join(config.embeddings.embeddings_path, docid_to_hash_fn)
    return file_hashes_fn, docid_hashes_fn


# NOTE: the click wiring here is a reconstruction; the original imports click
# and calls main() as the entry point, so a click group with one command per
# pipeline stage is assumed.
@click.group()
def main():
    pass
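

# Index construction builds both a dense (Chroma) and a sparse (SPLADE) index
# over the same document chunks, so the pipeline can draw on both at query time.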
@main.command()
@click.argument("app_config_path")
def create_index(app_config_path):
    config = load_config(app_config_path)
    dense_db = ChromaDenseVectorDB(
        persist_folder=str(config.embeddings.embeddings_path), config=config
    )
    splitter = DocumentSplitter(config)
    all_docs, all_hash_filename_mappings, all_hash_docid_mappings = splitter.split()

    # dense embeddings
    dense_db.generate_embeddings(docs=all_docs)

    # sparse embeddings
    sparse_db = SpladeSparseVectorDB(config)
    sparse_db.generate_embeddings(docs=all_docs)

    # persist the file-hash and doc-id-hash mappings next to the embeddings
    file_hashes_fn, docid_hashes_fn = get_hash_mapping_filenames(config)
    all_hash_filename_mappings.to_parquet(
        file_hashes_fn, compression="snappy", index=False
    )
    all_hash_docid_mappings.to_parquet(
        docid_hashes_fn, compression="snappy", index=False
    )
    logger.info("Document Embeddings Generated")


@main.command()
@click.argument("app_config_path")
@click.argument("model_config_path")
def predict_pipeline(app_config_path: str, model_config_path: str):
    config = load_config(app_config_path, model_config_path)

    # llm = OpenAIModel(config=config.llm.params)
    llm = VertexAIModel(config=config.llm.params)
    chain = load_qa_chain(llm=llm.model, prompt=llm.prompt)

    # dense retriever over the persisted Chroma index
    store = ChromaDenseVectorDB(
        persist_folder=str(config.embeddings.embeddings_path), config=config
    )
    store._load_retriever()

    reranker = BCEReranker()
    chunk_sizes = config.embeddings.chunk_sizes

    # sparse SPLADE retriever
    splade = SpladeSparseVectorDB(config=config)
    splade.load()

    # HyDE: draft a hypothetical answer passage and retrieve against it
    hyde_chain = LLMChain(
        llm=llm.model,
        prompt=PromptTemplate(
            template="Write a short passage to answer the question: {question}",
            input_variables=["question"],
        ),
    )

    llm_bundle = LLMBundle(
        chain=chain,
        reranker=reranker,
        chunk_sizes=chunk_sizes,
        sparse_db=splade,
        dense_db=store,
        hyde_chain=hyde_chain,
    )
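
    # evaluation_dataset.json is JSONL; each record is expected to carry at
    # least these fields (values illustrative):
    #   {"question": "...", "answer": "...", "context": "..."}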
    test_dataset = pd.read_json("evaluation_dataset.json", lines=True)
    evaluate_data = {
        "question": [],
        "answer": [],
        "contexts": [],  # should be a list[list[str]]
        "ground_truth": [],
        "context_ground_truth": [],
    }

    # evaluate only the first 10 questions
    test_dataset = test_dataset.head(10)
    for _, row in tqdm(test_dataset.iterrows(), total=len(test_dataset)):
        output = llm_bundle.get_and_parse_response(
            query=row["question"],
            config=config,
        )
        response = output.response
        evaluate_data["question"].append(row["question"])
        evaluate_data["answer"].append(response)
        evaluate_data["contexts"].append(output.semantic_search)
        evaluate_data["ground_truth"].append(row["answer"])
        evaluate_data["context_ground_truth"].append(row["context"])

    evaluate_dataset = Dataset.from_dict(evaluate_data)
    # store the evaluation dataset
    evaluate_dataset.to_pandas().to_json(
        "evaluation_output.json", orient="records", lines=True
    )
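

# Prediction and scoring are deliberately split: predict_pipeline persists its
# outputs to evaluation_output.json, so the ragas scoring below can be rerun
# without re-querying the retrieval pipeline.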
@main.command()
def evaluate_pipeline():
    # judge LLM for ragas: Gemini Pro served through Vertex AI
    ragas_vertexai_llm = LangchainLLMWrapper(ChatVertexAI(model_name="gemini-pro"))
    # the embeddings, despite the Vertex AI judge, are a local BCE
    # sentence-transformer model, wrapped for ragas
    ragas_embeddings = LangchainEmbeddingsWrapper(
        SentenceTransformerEmbeddings(model_name="maidalun1020/bce-embedding-base_v1")
    )

    metrics = [
        # the accuracy of the generated answer when compared to the ground truth
        answer_correctness,
        # whether the ground-truth-relevant items in the contexts are ranked near the top
        context_precision,
        # how pertinent the generated answer is to the given prompt
        answer_relevancy,
    ]

    evaluate_dataset = pd.read_json("evaluation_output.json", lines=True)
    evaluate_dataset = Dataset.from_pandas(evaluate_dataset)
    evaluate_result = evaluate(
        dataset=evaluate_dataset,
        metrics=metrics,
        llm=ragas_vertexai_llm,
        embeddings=ragas_embeddings,
        is_async=True,
    )

    evaluate_result_df = evaluate_result.to_pandas()
    # drop the bulky contexts and context_ground_truth columns before reporting
    evaluate_result_df = evaluate_result_df.drop(
        columns=["contexts", "context_ground_truth"]
    )
    # print the mean of the answer_correctness, context_precision, and
    # answer_relevancy scores
    print(evaluate_result_df.mean(numeric_only=True))
    evaluate_result_df.to_csv("evaluation_results.csv", index=False)
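

# Example invocations (script name and config paths are illustrative; click
# derives the dashed command names from the function names):
#   python evaluate.py create-index app.yaml
#   python evaluate.py predict-pipeline app.yaml model.yaml
#   python evaluate.py evaluate-pipeline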
if __name__ == "__main__":
    main()