""" Testset Generation module for the RAG system using Ragas. This script generates question-answer pairs from documents to be used in evaluation. how to run: python generate_testset.py ... """ # pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,too-many-return-statements,ungrouped-imports,line-too-long,logging-fstring-interpolation,duplicate-code,too-few-public-methods import os import sys import logging from typing import List, Any from PyPDF2 import PdfReader try: # Newer langchain versions expose Document in langchain.schema from langchain.schema import Document except Exception: try: # Older versions used langchain.docstore.document from langchain.docstore.document import Document except Exception: # Minimal fallback Document for environments without langchain from dataclasses import dataclass @dataclass class Document: page_content: str metadata: dict | None = None from ragas.testset.synthesizers.generate import TestsetGenerator try: from langchain.chat_models import ChatOpenAI except Exception: from langchain_openai import ChatOpenAI try: from langchain_huggingface import HuggingFaceEmbeddings except Exception: from langchain_community.embeddings import HuggingFaceEmbeddings try: from langchain.schema import SystemMessage, HumanMessage except Exception: # Minimal stand-ins if langchain.schema isn't available from dataclasses import dataclass @dataclass class SystemMessage: content: str @dataclass class HumanMessage: content: str def _extract_chat_response(resp) -> str: """Robust extraction of text from various ChatOpenAI response shapes.""" try: # langchain newer: AIMessage with .content if hasattr(resp, "content"): return resp.content # langchain older/other: ChatResult with .generations if hasattr(resp, "generations"): gens = resp.generations # gens may be list[list[Generation]] or list[Generation] try: return gens[0][0].text except Exception: try: return gens[0].text except Exception: pass # fallback dict/list shapes if isinstance(resp, list) and resp: first = resp[0] if hasattr(first, "content"): return first.content if isinstance(first, dict) and "content" in first: return first["content"] if isinstance(resp, dict): for k in ("content", "text"): if k in resp: return resp[k] except Exception: pass return str(resp) def summarize_documents(docs, llm, max_summary_chars: int = 2000) -> List[Document]: """Summarize each Document using the provided LLM into shorter Documents. This is optional and controlled by the `USE_CHUNK_SUMMARIZATION` env var. """ summaries: List[Document] = [] for i, doc in enumerate(docs): text = (doc.page_content or "").strip() if not text: continue # Construct a concise summarization prompt prompt = ( f"Summarize the following text into a concise summary (preserve key facts, numbers, and named entities). " f"Aim for no more than {max_summary_chars} characters. Return only the summary, no commentary.\n\nText:\n" + text ) try: messages = [ SystemMessage(content="You are a concise summarizer."), HumanMessage(content=prompt), ] resp = llm(messages) summary = _extract_chat_response(resp) except Exception: try: resp = llm(prompt) summary = _extract_chat_response(resp) except Exception as e: logging.debug(f"Summarization failed for chunk {i}: {e}") # Fallback: truncate summary = text[:max_summary_chars] summary = (summary or "").strip() if not summary: summary = text[:max_summary_chars] meta = dict(doc.metadata) if getattr(doc, "metadata", None) else {} meta.update({"chunk": i}) summaries.append(Document(page_content=summary, metadata=meta)) return summaries # Text splitting to avoid sending huge prompts to the LLM try: from langchain.text_splitter import RecursiveCharacterTextSplitter except Exception: # Minimal fallback splitter if langchain isn't available class RecursiveCharacterTextSplitter: def __init__(self, chunk_size: int = 8000, chunk_overlap: int = 500): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap def split_documents(self, docs): out = [] for doc in docs: text = doc.page_content or "" step = max(1, self.chunk_size - self.chunk_overlap) for i in range(0, len(text), step): chunk = text[i : i + self.chunk_size] out.append(Document(page_content=chunk, metadata=doc.metadata)) return out def get_documents_from_pdfs(pdf_paths: List[str]) -> List[Document]: """ Load PDFs and convert them to LangChain Document objects. Parameters ---------- pdf_paths : List[str] List of paths to PDF files. Returns ------- List[Document] List of LangChain Document objects. """ documents = [] for path in pdf_paths: try: reader = PdfReader(path) text = "" for page in reader.pages: page_text = page.extract_text() if page_text: text += page_text source = os.path.basename(path) documents.append(Document(page_content=text, metadata={"source": source})) except Exception as e: logging.error(f"Error reading {path}: {e}") return documents def generate_testset( pdf_paths: List[str], test_size: int = 10, output_path: str = "testset.csv" ) -> Any: """ Generate a test set from the given PDFs. Parameters ---------- pdf_paths : List[str] List of paths to PDF files. test_size : int, optional Number of QA pairs to generate. output_path : str, optional Path to save the generated test set (CSV). Returns ------- Any The generated test set. """ documents = get_documents_from_pdfs(pdf_paths) if not documents: logging.error("No documents found to generate testset from.") return None # Configure LLM and Embeddings consistent with the app # Use environment variables for API keys and Base URL (e.g. standard OPENAI_*, or manually set) # Allow overriding the LLM model via env var model_name = os.getenv("TESTSET_LLM_MODEL", "openai/gpt-4o-mini") logging.info(f"Using LLM model: {model_name}") # Prefer OpenRouter when available so generated LLM clients use it by default. _openrouter_key = os.getenv("OPENROUTER_API_KEY") if _openrouter_key: os.environ["OPENAI_API_BASE"] = "https://api.openrouter.ai/v1" os.environ["OPENAI_API_KEY"] = _openrouter_key logging.info( "OpenRouter detected; routing OpenAI calls via %s", os.environ["OPENAI_API_BASE"], ) logging.info( "OPENAI_API_KEY loaded=%s", bool(os.environ.get("OPENAI_API_KEY")), ) # Create LLM clients (will read credentials from environment) generator_llm = ChatOpenAI(model=model_name) # Note: critic_llm would be used for test evaluation if needed in future # critic_llm = ChatOpenAI(model=model_name) embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5") # Initialize generator (provide the generator LLM and the embeddings) generator = TestsetGenerator.from_langchain(generator_llm, embeddings) # Split large documents into smaller chunks to avoid exceeding model context limits splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=500) split_docs = splitter.split_documents(documents) # Generate testset (use default query distribution) logging.info( f"Generating testset of size {test_size} from {len(split_docs)} chunks..." ) testset = generator.generate_with_langchain_docs(split_docs, testset_size=test_size) # Export to CSV test_df = testset.to_pandas() test_df.to_csv(output_path, index=False) logging.info(f"Testset saved to {output_path}") return testset if __name__ == "__main__": logging.basicConfig(level=logging.INFO) if len(sys.argv) < 2: print("Usage: python generate_testset.py ...") else: pdf_files = sys.argv[1:] generate_testset(pdf_files)