#!/usr/bin/env python3
"""
One-time script to generate embeddings for all PDF documents.
Processes PDFs, chunks them, generates embeddings, and saves to JSON.
"""
import os
import sys
import json
import time
from pathlib import Path

from openai import OpenAI

# Add the project root to sys.path so the `backend` package resolves when
# this script is executed directly (e.g. `python scripts/generate_embeddings.py`).
sys.path.append(str(Path(__file__).parent.parent))

from backend.chunk_processor import ChunkProcessor
from backend.embeddings import EmbeddingSearch
def main() -> None:
    """Process all PDFs and generate embeddings.

    Chunks every PDF found in ``pdf_directory`` (grouped by document
    version), generates one OpenAI embedding per chunk, and writes each
    version's enriched chunks plus processing metadata to
    ``<output_directory>/<version>.json``.

    Exits with status 1 if the OPENAI_API_KEY environment variable is unset.
    """
    # Configuration. Chunking parameters and model name are defined once so
    # the metadata written to each JSON file always matches what was actually
    # used to build the chunks (previously the values were duplicated below).
    chunk_size = 1000
    overlap = 200
    # NOTE(review): assumed to be the model EmbeddingSearch uses internally —
    # confirm against backend/embeddings.py.
    embedding_model = "text-embedding-3-small"
    pdf_directory = Path("/Users/jsv/Work/ataya/concert-master/pdfs")
    output_directory = Path(__file__).parent.parent / "data" / "embeddings"

    # Fail fast if the API key is missing.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set")
        sys.exit(1)

    # Initialize components
    client = OpenAI(api_key=api_key)
    chunk_processor = ChunkProcessor(chunk_size=chunk_size, overlap=overlap)
    embedding_search = EmbeddingSearch(client)

    print(f"Processing PDFs from: {pdf_directory}")
    print(f"Output directory: {output_directory}")

    # Create output directory
    output_directory.mkdir(parents=True, exist_ok=True)

    # Chunk all PDFs, grouped by document version.
    version_chunks = chunk_processor.process_directory(pdf_directory)
    print(f"\nFound {len(version_chunks)} versions:")
    for version, chunks in version_chunks.items():
        print(f" - {version}: {len(chunks)} chunks")

    # Generate embeddings for each version, then persist to disk.
    for version, chunks in version_chunks.items():
        print(f"\nGenerating embeddings for {version}...")
        for i, chunk in enumerate(chunks):
            if i % 10 == 0:
                print(f" Processing chunk {i+1}/{len(chunks)}...")
            # Embed the chunk text and tag it with a stable, sortable ID.
            chunk["embedding"] = embedding_search.generate_embedding(chunk["text"])
            chunk["id"] = f"{version}_{i:04d}"
            # Crude rate limiting to stay under API request-per-minute caps.
            time.sleep(0.1)

        # Write chunks + metadata. UTF-8 with ensure_ascii=False keeps any
        # non-ASCII text from the PDFs readable and platform-independent.
        output_path = output_directory / f"{version}.json"
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "version": version,
                    "chunk_count": len(chunks),
                    "chunk_size": chunk_size,
                    "overlap": overlap,
                    "embedding_model": embedding_model,
                    "chunks": chunks,
                },
                f,
                indent=2,
                ensure_ascii=False,
            )
        print(f" Saved {len(chunks)} chunks with embeddings to {output_path}")

    print("\n✅ Embedding generation complete!")

    # Print summary
    total_chunks = sum(len(chunks) for chunks in version_chunks.values())
    print(f"\nSummary:")
    print(f" - Total chunks processed: {total_chunks}")
    print(f" - Versions created: {len(version_chunks)}")
    print(f" - Output directory: {output_directory}")
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()