#!/usr/bin/env python3
"""One-time script to generate embeddings for all PDF documents.

Processes PDFs, chunks them, generates embeddings, and saves to JSON —
one output file per document version under data/embeddings/.
"""

import json
import os
import sys
import time
from pathlib import Path

from openai import OpenAI

# Add parent directory to path so the `backend` package resolves when this
# script is run directly rather than as a module.
sys.path.append(str(Path(__file__).parent.parent))

from backend.chunk_processor import ChunkProcessor
from backend.embeddings import EmbeddingSearch

# Chunking parameters — defined once so the processor configuration and the
# metadata recorded in each output file cannot drift apart.
CHUNK_SIZE = 1000
OVERLAP = 200
# NOTE(review): recorded as metadata only; assumes EmbeddingSearch uses this
# model — confirm against backend.embeddings.
EMBEDDING_MODEL = "text-embedding-3-small"

# Default input location; override with argv[1] or the PDF_DIRECTORY env var.
DEFAULT_PDF_DIRECTORY = "/Users/jsv/Work/ataya/concert-master/pdfs"


def _resolve_pdf_directory() -> Path:
    """Return the PDF input directory from argv[1], $PDF_DIRECTORY, or the default."""
    if len(sys.argv) > 1:
        return Path(sys.argv[1])
    return Path(os.getenv("PDF_DIRECTORY", DEFAULT_PDF_DIRECTORY))


def _embed_chunks(version: str, chunks: list, embedding_search) -> None:
    """Attach an embedding and a stable zero-padded ID to every chunk, in place."""
    total = len(chunks)
    for i, chunk in enumerate(chunks):
        if i % 10 == 0:
            print(f" Processing chunk {i + 1}/{total}...")
        chunk["embedding"] = embedding_search.generate_embedding(chunk["text"])
        chunk["id"] = f"{version}_{i:04d}"
        # Rate limiting (to avoid hitting API limits)
        time.sleep(0.1)


def _save_version(output_directory: Path, version: str, chunks: list) -> None:
    """Write one version's chunks (with embeddings) to <version>.json."""
    output_path = output_directory / f"{version}.json"
    # Explicit encoding: output contains non-ASCII and must not depend on
    # the platform's default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "version": version,
                "chunk_count": len(chunks),
                "chunk_size": CHUNK_SIZE,
                "overlap": OVERLAP,
                "embedding_model": EMBEDDING_MODEL,
                "chunks": chunks,
            },
            f,
            indent=2,
        )
    print(f" Saved {len(chunks)} chunks with embeddings to {output_path}")


def main():
    """Process all PDFs and generate embeddings."""
    pdf_directory = _resolve_pdf_directory()
    output_directory = Path(__file__).parent.parent / "data" / "embeddings"

    # Fail fast on missing prerequisites before touching the API.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set")
        sys.exit(1)
    if not pdf_directory.is_dir():
        print(f"Error: PDF directory does not exist: {pdf_directory}")
        sys.exit(1)

    # Initialize components
    client = OpenAI(api_key=api_key)
    chunk_processor = ChunkProcessor(chunk_size=CHUNK_SIZE, overlap=OVERLAP)
    embedding_search = EmbeddingSearch(client)

    print(f"Processing PDFs from: {pdf_directory}")
    print(f"Output directory: {output_directory}")
    output_directory.mkdir(parents=True, exist_ok=True)

    # Process all PDFs, grouped by document version.
    version_chunks = chunk_processor.process_directory(pdf_directory)
    print(f"\nFound {len(version_chunks)} versions:")
    for version, chunks in version_chunks.items():
        print(f" - {version}: {len(chunks)} chunks")

    # Generate embeddings for each version, then persist it.
    for version, chunks in version_chunks.items():
        print(f"\nGenerating embeddings for {version}...")
        _embed_chunks(version, chunks, embedding_search)
        _save_version(output_directory, version, chunks)

    print("\n✅ Embedding generation complete!")

    # Print summary
    total_chunks = sum(len(chunks) for chunks in version_chunks.values())
    print("\nSummary:")
    print(f" - Total chunks processed: {total_chunks}")
    print(f" - Versions created: {len(version_chunks)}")
    print(f" - Output directory: {output_directory}")


if __name__ == "__main__":
    main()