# Source: openai-chatbot-mcp / scripts/generate_embeddings.py
# Author: Julian Vanecek
# Commit: "Removed openai assistant, added local embedding comparisons" (ac4560f)
#!/usr/bin/env python3
"""
One-time script to generate embeddings for all PDF documents.
Processes PDFs, chunks them, generates embeddings, and saves to JSON.
"""
import os
import sys
import json
import time
from pathlib import Path
from openai import OpenAI
# Add parent directory to path
sys.path.append(str(Path(__file__).parent.parent))
from backend.chunk_processor import ChunkProcessor
from backend.embeddings import EmbeddingSearch
def main(pdf_directory=None, output_directory=None):
    """Process all PDFs, generate embeddings, and save one JSON file per version.

    Each PDF directory is chunked by ``ChunkProcessor``, every chunk gets an
    OpenAI embedding via ``EmbeddingSearch``, and the enriched chunks are
    written to ``<output_directory>/<version>.json``.

    Args:
        pdf_directory: Directory containing source PDFs. Defaults to the
            historical hard-coded location for backward compatibility.
        output_directory: Destination for the per-version JSON files.
            Defaults to ``<repo>/data/embeddings``.

    Exits with status 1 if OPENAI_API_KEY is not set.
    """
    # Single source of truth for chunking/model parameters; these are also
    # recorded in each output file so downstream readers can verify them.
    chunk_size = 1000
    overlap = 200
    embedding_model = "text-embedding-3-small"

    if pdf_directory is None:
        pdf_directory = Path("/Users/jsv/Work/ataya/concert-master/pdfs")
    if output_directory is None:
        output_directory = Path(__file__).parent.parent / "data" / "embeddings"

    # Fail fast if credentials are missing.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set")
        sys.exit(1)

    # Initialize components.
    client = OpenAI(api_key=api_key)
    chunk_processor = ChunkProcessor(chunk_size=chunk_size, overlap=overlap)
    embedding_search = EmbeddingSearch(client)

    print(f"Processing PDFs from: {pdf_directory}")
    print(f"Output directory: {output_directory}")

    output_directory.mkdir(parents=True, exist_ok=True)

    # Group chunks by document version (one output file per version).
    version_chunks = chunk_processor.process_directory(pdf_directory)
    print(f"\nFound {len(version_chunks)} versions:")
    for version, chunks in version_chunks.items():
        print(f" - {version}: {len(chunks)} chunks")

    for version, chunks in version_chunks.items():
        print(f"\nGenerating embeddings for {version}...")
        for i, chunk in enumerate(chunks):
            if i % 10 == 0:
                print(f" Processing chunk {i+1}/{len(chunks)}...")
            chunk["embedding"] = embedding_search.generate_embedding(chunk["text"])
            # Stable, zero-padded id keeps chunks sortable, e.g. "v1_0007".
            chunk["id"] = f"{version}_{i:04d}"
            # Crude rate limiting to stay under the embeddings API limits.
            time.sleep(0.1)

        output_path = output_directory / f"{version}.json"
        # utf-8 + ensure_ascii=False keeps any non-ASCII PDF text readable
        # in the output instead of being written as \uXXXX escapes.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "version": version,
                    "chunk_count": len(chunks),
                    "chunk_size": chunk_size,
                    "overlap": overlap,
                    "embedding_model": embedding_model,
                    "chunks": chunks,
                },
                f,
                indent=2,
                ensure_ascii=False,
            )
        print(f" Saved {len(chunks)} chunks with embeddings to {output_path}")

    print("\n✅ Embedding generation complete!")

    # Summary across all versions.
    total_chunks = sum(len(chunks) for chunks in version_chunks.values())
    print(f"\nSummary:")
    print(f" - Total chunks processed: {total_chunks}")
    print(f" - Versions created: {len(version_chunks)}")
    print(f" - Output directory: {output_directory}")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()