Spaces:
Running
Running
| """ | |
| Testset Generation module for the RAG system using Ragas. | |
| This script generates question-answer pairs from documents to be used in evaluation. | |
| how to run: | |
| python generate_testset.py <pdf_path1> <pdf_path2> ... | |
| """ | |
| # pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,too-many-return-statements,ungrouped-imports,line-too-long,logging-fstring-interpolation,duplicate-code,too-few-public-methods | |
| import os | |
| import sys | |
| import logging | |
| from typing import List, Any | |
| from PyPDF2 import PdfReader | |
| try: | |
| # Newer langchain versions expose Document in langchain.schema | |
| from langchain.schema import Document | |
| except Exception: | |
| try: | |
| # Older versions used langchain.docstore.document | |
| from langchain.docstore.document import Document | |
| except Exception: | |
| # Minimal fallback Document for environments without langchain | |
| from dataclasses import dataclass | |
| class Document: | |
| page_content: str | |
| metadata: dict | None = None | |
| from ragas.testset.synthesizers.generate import TestsetGenerator | |
| try: | |
| from langchain.chat_models import ChatOpenAI | |
| except Exception: | |
| from langchain_openai import ChatOpenAI | |
| try: | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| except Exception: | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
try:
    from langchain.schema import SystemMessage, HumanMessage
except Exception:
    # Minimal stand-ins if langchain.schema isn't available.
    from dataclasses import dataclass

    # BUG FIX: the original stand-ins had annotated fields but no @dataclass
    # decorator, so SystemMessage(content=...) / HumanMessage(content=...)
    # (used in summarize_documents) raised TypeError.
    @dataclass
    class SystemMessage:
        """Stand-in for langchain's SystemMessage (only .content is used)."""

        content: str

    @dataclass
    class HumanMessage:
        """Stand-in for langchain's HumanMessage (only .content is used)."""

        content: str
| def _extract_chat_response(resp) -> str: | |
| """Robust extraction of text from various ChatOpenAI response shapes.""" | |
| try: | |
| # langchain newer: AIMessage with .content | |
| if hasattr(resp, "content"): | |
| return resp.content | |
| # langchain older/other: ChatResult with .generations | |
| if hasattr(resp, "generations"): | |
| gens = resp.generations | |
| # gens may be list[list[Generation]] or list[Generation] | |
| try: | |
| return gens[0][0].text | |
| except Exception: | |
| try: | |
| return gens[0].text | |
| except Exception: | |
| pass | |
| # fallback dict/list shapes | |
| if isinstance(resp, list) and resp: | |
| first = resp[0] | |
| if hasattr(first, "content"): | |
| return first.content | |
| if isinstance(first, dict) and "content" in first: | |
| return first["content"] | |
| if isinstance(resp, dict): | |
| for k in ("content", "text"): | |
| if k in resp: | |
| return resp[k] | |
| except Exception: | |
| pass | |
| return str(resp) | |
def summarize_documents(docs, llm, max_summary_chars: int = 2000) -> List[Document]:
    """Summarize each Document with the provided LLM into shorter Documents.

    Empty documents are skipped. If both the chat-style and plain-prompt LLM
    invocations fail, the original text is truncated to ``max_summary_chars``
    as a fallback. Each output Document carries the source metadata plus a
    ``chunk`` index. Optional; controlled by the `USE_CHUNK_SUMMARIZATION`
    env var.
    """
    results: List[Document] = []
    for idx, source_doc in enumerate(docs):
        body = (source_doc.page_content or "").strip()
        if not body:
            continue
        # Concise summarization prompt; the size cap is advisory to the LLM.
        prompt = (
            f"Summarize the following text into a concise summary (preserve key facts, numbers, and named entities). "
            f"Aim for no more than {max_summary_chars} characters. Return only the summary, no commentary.\n\nText:\n"
            + body
        )
        try:
            # Preferred path: chat-message invocation.
            chat_messages = [
                SystemMessage(content="You are a concise summarizer."),
                HumanMessage(content=prompt),
            ]
            summary = _extract_chat_response(llm(chat_messages))
        except Exception:
            try:
                # Some LLM wrappers accept a bare string prompt instead.
                summary = _extract_chat_response(llm(prompt))
            except Exception as err:
                logging.debug(f"Summarization failed for chunk {idx}: {err}")
                # Fallback: truncate the original text.
                summary = body[:max_summary_chars]
        summary = (summary or "").strip()
        if not summary:
            summary = body[:max_summary_chars]
        # Copy metadata so the source document's dict is never mutated.
        new_meta = dict(source_doc.metadata) if getattr(source_doc, "metadata", None) else {}
        new_meta["chunk"] = idx
        results.append(Document(page_content=summary, metadata=new_meta))
    return results
# Text splitting to avoid sending huge prompts to the LLM
try:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
except Exception:
    # Minimal fallback splitter if langchain isn't available
    class RecursiveCharacterTextSplitter:
        """Naive fixed-size character splitter mimicking langchain's interface."""

        def __init__(self, chunk_size: int = 8000, chunk_overlap: int = 500):
            # chunk_overlap characters are shared between consecutive chunks;
            # step is clamped to >= 1 so a pathological overlap can't stall.
            self.chunk_size = chunk_size
            self.chunk_overlap = chunk_overlap

        def split_documents(self, docs):
            """Split each doc's page_content into overlapping fixed-size chunks."""
            out = []
            for doc in docs:
                text = doc.page_content or ""
                step = max(1, self.chunk_size - self.chunk_overlap)
                for i in range(0, len(text), step):
                    chunk = text[i : i + self.chunk_size]
                    # BUG FIX: copy metadata per chunk — the original passed the
                    # parent doc's dict by reference, so mutating one chunk's
                    # metadata silently changed all of its siblings.
                    meta = dict(doc.metadata) if getattr(doc, "metadata", None) else {}
                    out.append(Document(page_content=chunk, metadata=meta))
            return out
def get_documents_from_pdfs(pdf_paths: List[str]) -> List[Document]:
    """
    Load PDFs and convert them to LangChain Document objects.

    Parameters
    ----------
    pdf_paths : List[str]
        List of paths to PDF files.

    Returns
    -------
    List[Document]
        One Document per readable PDF; unreadable files are logged and skipped.
    """
    docs: List[Document] = []
    for pdf_path in pdf_paths:
        try:
            reader = PdfReader(pdf_path)
            # extract_text() may return None for image-only pages; drop those.
            page_texts = [page.extract_text() for page in reader.pages]
            full_text = "".join(chunk for chunk in page_texts if chunk)
            docs.append(
                Document(
                    page_content=full_text,
                    metadata={"source": os.path.basename(pdf_path)},
                )
            )
        except Exception as exc:
            logging.error(f"Error reading {pdf_path}: {exc}")
    return docs
def generate_testset(
    pdf_paths: List[str], test_size: int = 10, output_path: str = "testset.csv"
) -> Any:
    """
    Generate a test set from the given PDFs.

    Parameters
    ----------
    pdf_paths : List[str]
        List of paths to PDF files.
    test_size : int, optional
        Number of QA pairs to generate.
    output_path : str, optional
        Path to save the generated test set (CSV).

    Returns
    -------
    Any
        The generated testset object, or None if no documents could be loaded.
    """
    documents = get_documents_from_pdfs(pdf_paths)
    if not documents:
        logging.error("No documents found to generate testset from.")
        return None
    # Configure LLM and Embeddings consistent with the app.
    # Credentials come from the standard OPENAI_* environment variables
    # (or OpenRouter, handled below). Allow overriding the model via env var.
    model_name = os.getenv("TESTSET_LLM_MODEL", "openai/gpt-4o-mini")
    logging.info(f"Using LLM model: {model_name}")
    # Prefer OpenRouter when available so generated LLM clients use it by default.
    _openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if _openrouter_key:
        # BUG FIX: OpenRouter's OpenAI-compatible endpoint is
        # https://openrouter.ai/api/v1 — the previous "https://api.openrouter.ai/v1"
        # host does not serve the API, so every request failed to connect.
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = _openrouter_key
        logging.info(
            "OpenRouter detected; routing OpenAI calls via %s",
            os.environ["OPENAI_API_BASE"],
        )
    logging.info(
        "OPENAI_API_KEY loaded=%s",
        bool(os.environ.get("OPENAI_API_KEY")),
    )
    # Create LLM clients (will read credentials from environment).
    generator_llm = ChatOpenAI(model=model_name)
    # Note: critic_llm would be used for test evaluation if needed in future
    # critic_llm = ChatOpenAI(model=model_name)
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    # Initialize generator (provide the generator LLM and the embeddings).
    generator = TestsetGenerator.from_langchain(generator_llm, embeddings)
    # Split large documents into smaller chunks to avoid exceeding model context limits.
    splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=500)
    split_docs = splitter.split_documents(documents)
    # Generate testset (use default query distribution).
    logging.info(
        f"Generating testset of size {test_size} from {len(split_docs)} chunks..."
    )
    testset = generator.generate_with_langchain_docs(split_docs, testset_size=test_size)
    # Export to CSV for downstream evaluation.
    test_df = testset.to_pandas()
    test_df.to_csv(output_path, index=False)
    logging.info(f"Testset saved to {output_path}")
    return testset
| if __name__ == "__main__": | |
| logging.basicConfig(level=logging.INFO) | |
| if len(sys.argv) < 2: | |
| print("Usage: python generate_testset.py <pdf_path1> <pdf_path2> ...") | |
| else: | |
| pdf_files = sys.argv[1:] | |
| generate_testset(pdf_files) | |